Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to prefer ScanCode's file- over line-level findings #8152

Merged
merged 9 commits into from
Jan 29, 2024
3 changes: 3 additions & 0 deletions model/src/main/resources/reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ ort:
# Command line options that do not affect the ScanCode output.
commandLineNonConfig: '--processes 4'

# Use per-file license findings instead of per-line ones.
preferFileLicense: false

# Criteria for matching stored scan results. These can be configured for any scanner that uses semantic
# versioning. Note that the 'maxVersion' is exclusive and not part of the range of accepted versions.
minVersion: '3.2.1-rc2'
Expand Down
1 change: 1 addition & 0 deletions model/src/test/kotlin/config/OrtConfigurationTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ class OrtConfigurationTest : WordSpec({
options shouldContainExactly mapOf(
"commandLine" to "--copyright --license --info --strip-root --timeout 300",
"commandLineNonConfig" to "--processes 4",
"preferFileLicense" to "false",
"minVersion" to "3.2.1-rc2",
"maxVersion" to "32.0.0"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import org.ossreviewtoolkit.utils.spdx.getLicenseText
import org.ossreviewtoolkit.utils.test.ExpensiveTag

class ScanCodeScannerFunTest : AbstractPathScannerWrapperFunTest(setOf(ExpensiveTag)) {
override val scanner = ScanCode("ScanCode", ScanCodeConfig.EMPTY, ScannerWrapperConfig.EMPTY)
override val scanner = ScanCode("ScanCode", ScanCodeConfig.DEFAULT, ScannerWrapperConfig.EMPTY)

override val expectedFileLicenses = listOf(
LicenseFinding("Apache-2.0", TextLocation("LICENSE", 1, 187), 100.0f),
Expand Down
90 changes: 33 additions & 57 deletions plugins/scanners/scancode/src/main/kotlin/ScanCode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ package org.ossreviewtoolkit.plugins.scanners.scancode
import java.io.File
import java.time.Instant

import kotlin.math.max

import org.apache.logging.log4j.kotlin.logger

import org.ossreviewtoolkit.model.ScanSummary
Expand All @@ -40,7 +38,6 @@ import org.ossreviewtoolkit.utils.common.Options
import org.ossreviewtoolkit.utils.common.Os
import org.ossreviewtoolkit.utils.common.ProcessCapture
import org.ossreviewtoolkit.utils.common.safeDeleteRecursively
import org.ossreviewtoolkit.utils.common.splitOnWhitespace
import org.ossreviewtoolkit.utils.common.withoutPrefix
import org.ossreviewtoolkit.utils.ort.createOrtTempDir

Expand All @@ -55,49 +52,30 @@ import org.semver4j.Semver
* configuration [options][PluginConfiguration.options]:
*
* * **"commandLine":** Command line options that modify the result. These are added to the [ScannerDetails] when
* looking up results from the [ScanResultsStorage]. Defaults to [DEFAULT_CONFIGURATION_OPTIONS].
* looking up results from the [ScanResultsStorage]. Defaults to [ScanCodeConfig.DEFAULT_COMMAND_LINE_OPTIONS].
* * **"commandLineNonConfig":** Command line options that do not modify the result and should therefore not be
* considered in [configuration], like "--processes". Defaults to [DEFAULT_NON_CONFIGURATION_OPTIONS].
* considered in [configuration], like "--processes". Defaults to
* [ScanCodeConfig.DEFAULT_COMMAND_LINE_NON_CONFIG_OPTIONS].
* * **preferFileLicense**: A flag to indicate whether the "high-level" per-file license reported by ScanCode starting
* with version 32 should be used instead of the individual "low-level" per-line license findings. The per-file
* license may be different from the conjunction of per-line licenses and is supposed to contain fewer
* false-positives. However, no exact line numbers can be associated to the per-file license anymore. If enabled, the
* start line of the per-file license finding is set to the minimum of all start lines for per-line findings in that
* file, the end line is set to the maximum of all end lines for per-line findings in that file, and the score is set
* to the arithmetic average of the scores of all per-line findings in that file.
*/
class ScanCode internal constructor(
name: String,
config: ScanCodeConfig,
private val config: ScanCodeConfig,
private val wrapperConfig: ScannerWrapperConfig
) : CommandLinePathScannerWrapper(name) {
// This constructor is required by the `RequirementsCommand`.
constructor(name: String, wrapperConfig: ScannerWrapperConfig) : this(name, ScanCodeConfig.EMPTY, wrapperConfig)
constructor(name: String, wrapperConfig: ScannerWrapperConfig) : this(name, ScanCodeConfig.DEFAULT, wrapperConfig)

companion object {
const val SCANNER_NAME = "ScanCode"

private const val LICENSE_REFERENCES_OPTION_VERSION = "32.0.0"
private const val OUTPUT_FORMAT = "json-pp"
sschuberth marked this conversation as resolved.
Show resolved Hide resolved
private const val TIMEOUT = 300

/**
* Configuration options that are relevant for [configuration] because they change the result file.
*/
private val DEFAULT_CONFIGURATION_OPTIONS = listOf(
"--copyright",
"--license",
"--info",
"--strip-root",
"--timeout", TIMEOUT.toString()
)

/**
* Configuration options that are not relevant for [configuration] because they do not change the result
* file.
*/
private val DEFAULT_NON_CONFIGURATION_OPTIONS = listOf(
"--processes", max(1, Runtime.getRuntime().availableProcessors() - 1).toString()
)

private val OUTPUT_FORMAT_OPTION = if (OUTPUT_FORMAT.startsWith("json")) {
"--$OUTPUT_FORMAT"
} else {
"--output-$OUTPUT_FORMAT"
}
}

class Factory : ScannerWrapperFactory<ScanCodeConfig>(SCANNER_NAME) {
Expand All @@ -107,35 +85,33 @@ class ScanCode internal constructor(
override fun parseConfig(options: Options, secrets: Options) = ScanCodeConfig.create(options)
}

override val matcher by lazy { ScannerMatcher.create(details, wrapperConfig.matcherConfig) }
fviernau marked this conversation as resolved.
Show resolved Hide resolved

override val readFromStorage by lazy { wrapperConfig.readFromStorageWithDefault(matcher) }

override val writeToStorage by lazy { wrapperConfig.writeToStorageWithDefault(matcher) }

override val configuration by lazy {
buildList {
addAll(configurationOptions)
add(OUTPUT_FORMAT_OPTION)
}.joinToString(" ")
}

private val configurationOptions = config.commandLine?.splitOnWhitespace() ?: DEFAULT_CONFIGURATION_OPTIONS
private val nonConfigurationOptions = config.commandLineNonConfig?.splitOnWhitespace()
?: DEFAULT_NON_CONFIGURATION_OPTIONS
private val commandLineOptions by lazy { getCommandLineOptions(version) }

internal fun getCommandLineOptions(version: String) =
buildList {
addAll(configurationOptions)
addAll(nonConfigurationOptions)
addAll(config.commandLine)
addAll(config.commandLineNonConfig)
fviernau marked this conversation as resolved.
Show resolved Hide resolved

if (Semver(version).isGreaterThanOrEqualTo(LICENSE_REFERENCES_OPTION_VERSION)) {
// Required to be able to map ScanCode license keys to SPDX IDs.
add("--license-references")
}
}

val commandLineOptions by lazy { getCommandLineOptions(version) }
override val configuration by lazy {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding the per file findings as entries into the text location based license findings which correspond to a line range an a specific file, also implies that the existing license finding curation mechanism becomes applicable to these per file entries.

So, from the perspective of creating the license finding curations, I wonder about the implication such as would we accept (e.g. in ort-config) also license finding curations which match such per file license findings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would we accept (e.g. in ort-config) also license finding curations which match such per file license findings?

I think we should, mostly for two reasons:

  • there's hardly any negative impact compared to not doing so,
  • we should also accept curations for other supported scanners than ScanCode, and these other scanners could theoretically find exactly the file-level finding that ScanCode finds. If we'd accept such a curation for another scanner, we should accept it for ScanCode, too. (This somewhat reminds me of @mnonnenmacher's remark that our license finding curations should also include the scanner name and version.)

buildList {
addAll(config.commandLine)

// Add this in the style of a fake command line option for consistency with the above.
if (config.preferFileLicense) add("--prefer-file-license")
}.joinToString(" ")
}

override val matcher by lazy { ScannerMatcher.create(details, wrapperConfig.matcherConfig) }

override val readFromStorage by lazy { wrapperConfig.readFromStorageWithDefault(matcher) }

override val writeToStorage by lazy { wrapperConfig.writeToStorageWithDefault(matcher) }

override fun command(workingDir: File?) =
listOfNotNull(workingDir, if (Os.isWindows) "scancode.bat" else "scancode").joinToString(File.separator)
Expand Down Expand Up @@ -179,7 +155,7 @@ class ScanCode internal constructor(
}

override fun createSummary(result: String, startTime: Instant, endTime: Instant): ScanSummary =
parseResult(result).toScanSummary()
parseResult(result).toScanSummary(config.preferFileLicense)

/**
* Execute ScanCode with the configured arguments to scan the given [path] and produce [resultFile].
Expand All @@ -188,8 +164,8 @@ class ScanCode internal constructor(
ProcessCapture(
command(),
*commandLineOptions.toTypedArray(),
path.absolutePath,
OUTPUT_FORMAT_OPTION,
resultFile.absolutePath
// The output format option needs to directly precede the result file path.
"--json-pp", resultFile.absolutePath,
path.absolutePath
)
}
42 changes: 36 additions & 6 deletions plugins/scanners/scancode/src/main/kotlin/ScanCodeConfig.kt
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,49 @@

package org.ossreviewtoolkit.plugins.scanners.scancode

import kotlin.math.max
import kotlin.time.Duration.Companion.minutes

import org.ossreviewtoolkit.utils.common.Options
import org.ossreviewtoolkit.utils.common.splitOnWhitespace

data class ScanCodeConfig(
val commandLine: String?,
val commandLineNonConfig: String?
val commandLine: List<String>,
val commandLineNonConfig: List<String>,
val preferFileLicense: Boolean
) {
companion object {
val EMPTY = ScanCodeConfig(null, null)
/**
* The default time after which scanning a file is aborted.
*/
private val DEFAULT_TIMEOUT = 5.minutes

/**
* The default list of command line options that might have an impact on the scan results.
*/
private val DEFAULT_COMMAND_LINE_OPTIONS = listOf(
"--copyright",
"--license",
"--info",
"--strip-root",
fviernau marked this conversation as resolved.
Show resolved Hide resolved
"--timeout", "${DEFAULT_TIMEOUT.inWholeSeconds}"
)

/**
* The default list of command line options that cannot have an impact on the scan results.
*/
private val DEFAULT_COMMAND_LINE_NON_CONFIG_OPTIONS = listOf(
"--processes", max(1, Runtime.getRuntime().availableProcessors() - 1).toString()
)

private const val COMMAND_LINE_PROPERTY = "commandLine"
private const val COMMAND_LINE_NON_CONFIG_PROPERTY = "commandLineNonConfig"
val DEFAULT = create(emptyMap())

fun create(options: Options) =
ScanCodeConfig(options[COMMAND_LINE_PROPERTY], options[COMMAND_LINE_NON_CONFIG_PROPERTY])
ScanCodeConfig(
options["commandLine"]?.splitOnWhitespace() ?: DEFAULT_COMMAND_LINE_OPTIONS,
options["commandLineNonConfig"]?.splitOnWhitespace()
?: DEFAULT_COMMAND_LINE_NON_CONFIG_OPTIONS,
options["preferFileLicense"].toBoolean()
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ private data class LicenseMatch(
val score: Float
)

fun ScanCodeResult.toScanSummary(): ScanSummary {
fun ScanCodeResult.toScanSummary(preferFileLicense: Boolean = false): ScanSummary {
val licenseFindings = mutableSetOf<LicenseFinding>()
val copyrightFindings = mutableSetOf<CopyrightFinding>()
val issues = mutableListOf<Issue>()
Expand Down Expand Up @@ -91,19 +91,31 @@ fun ScanCodeResult.toScanSummary(): ScanSummary {
it.value.first()
}

licenses.mapTo(licenseFindings) { license ->
// ScanCode uses its own license keys as identifiers in license expressions.
val spdxLicenseExpression = license.licenseExpression.mapLicense(scanCodeKeyToSpdxIdMappings)

LicenseFinding(
license = spdxLicenseExpression,
if (preferFileLicense && file is FileEntry.Version3 && file.detectedLicenseExpressionSpdx != null) {
licenseFindings += LicenseFinding(
license = file.detectedLicenseExpressionSpdx,
location = TextLocation(
path = file.path,
startLine = license.startLine,
endLine = license.endLine
startLine = licenses.minOf { it.startLine },
endLine = licenses.maxOf { it.endLine }
),
score = license.score
score = licenses.map { it.score }.average().toFloat()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you know the rational DOS use when coming up with an avg score?
Also, I wonder in what way the CLI option --license-score influences the per file license? (and whether this should become a command line config option).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you know the rational DOS use when coming up with an avg score?

@Etsija and @lamppu probably know better, but the average is pretty much the only metric that somewhat makes sense and is reasonably easy to calculate. Min. does not make sense, max. does not make sense.

Of course, you could do something "crazy" like only taking the scores of those line findings into account whose licenses are also contained in the file finding, and maybe even weight the score according to the number of lines matched, but that's much more work for questionable benefit.

I wonder in what way the CLI option --license-score influences the per file license?

Let's ask the author's: @pombredanne, does ScanCode's --license-score option have an (implicit) impact on which licenses will be part of the per-file detected_license_expression(_spdx) fields?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course, you could do something "crazy" like only taking the scores of those line findings into account whose licenses are also contained in the file finding, and maybe even weight the score according to the number of lines matched, but that's much more work for questionable benefit.

Another alternative would be to omit lines and score entirely, with the rational that they stop being meaningful.
The reason I'm picking this up is, because (1) I suspect these values are a bit more likely to exhibit some bit of flakyness at least across scancode version (2) the result of the heuristic gets backed into the stored scan results and is thus very expensive to change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another alternative would be to omit lines and score entirely, with the rational that they stop being meaningful.

Making lines optional would be a bigger change in the data model, and it would prevent us from matching copyright to license findings, and probably have more implications.

However, I forgot that the scoreactually is optional already. Would you prefer to omit it instead of calculating the average?

I suspect these values are a bit more likely to exhibit some bit of flakyness at least across scancode version

More likely to be flaky than what, than the individual per-line findings? As the lines and score are calculated from the per-line findings, they can only be flaky if the per-line findings themselves are already flaky.

the result of the heuristic gets backed into the stored scan results and is thus very expensive to change.

Correct, but that's the way forward how ScanCode will operate, I guess. As this discussion shows, there seem to be no endeavors to enhance the quality of the per-line findings themselves, but to achieve better quality by post-processing these findings into the per-file finding.

Copy link
Member

@fviernau fviernau Jan 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, but that's the way forward how ScanCode will operate, I guess. As this discussion shows, there seem to be no endeavors to enhance the quality of the per-line findings themselves, but to achieve better quality by post-processing these findings into the per-file finding.

I don't much about the processing inside of scancode, so I don't want to move this discussion off-topic,
but it's already interesting that this is actually a post-processing step of per-line findings.
If the only input of this post processing step is the a per-line findings, then it could
also / in the long run make more sense to have that postprocessing step applied by ORT (using ScanCodes logic), not by ScanCode (at least not in a single step) in order to make use of the per-line finding curations. The process could be:

  1. Compute per line findings with ScanCode
  2. Apply per line finding curations aka license finding curations
  3. Take result from 2. as input for the post processing to compute the per-file findings using ScanCode's heuristic
    feeding in the curated license findings.

I believe this would exhibit several advantages over the current solution.

Also, IMO it'd be nice that per line findings would no more be mutually exclusive from per file findings.
Curations would be possible on both levels of granularity.

@pombredanne - what do you think about this idea?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make more sense to have that postprocessing step applied by ORT (using ScanCodes logic)

I agree, if that's possible, and ScanCode does not require internal data for this that's not written to the result file.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I forgot that the scoreactually is optional already. Would you prefer to omit it instead of calculating the average?

@fviernau I still need you input here. I believe that's the last thing to clarify before we could merge?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd have slight tendency to omit it, but I also believe my knowledge is too limited in that area to make the call.
The reason I believe why its better to omit it, is because I believe the way the score would be defined would even harder to grasp and hardly be actionable. E.g. if an SPDX expression would consist of several 100% findings and a few 80% findings, the final score would have a relatively low score even though some parts of the expression apply a 100%. So,e.g. filtering out on score would not be feasible anymore, because one would have two different ways the score could have been computed.

So, all in all I prefer to omit it, but would be fine if you kept it. Up to you.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mnonnenmacher what do you think about #8152 (comment) ?

)
Comment on lines +94 to 103
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI @Etsija, @lamppu, @mmurto: This should resemble DOS's logic.

} else {
licenses.mapTo(licenseFindings) { license ->
// ScanCode uses its own license keys as identifiers in license expressions.
val spdxLicenseExpression = license.licenseExpression.mapLicense(scanCodeKeyToSpdxIdMappings)

LicenseFinding(
license = spdxLicenseExpression,
location = TextLocation(
path = file.path,
startLine = license.startLine,
endLine = license.endLine
),
score = license.score
)
}
}

file.copyrights.mapTo(copyrightFindings) { copyright ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import io.kotest.core.spec.style.FreeSpec
import io.kotest.matchers.Matcher
import io.kotest.matchers.collections.beEmpty
import io.kotest.matchers.collections.containExactlyInAnyOrder
import io.kotest.matchers.collections.shouldContainExactlyInAnyOrder
import io.kotest.matchers.collections.shouldHaveSingleElement
import io.kotest.matchers.collections.shouldHaveSize
import io.kotest.matchers.should
Expand Down Expand Up @@ -80,9 +81,25 @@ class ScanCodeResultParserTest : FreeSpec({

val summary = parseResult(resultFile).toScanSummary()

summary.licenseFindings.find {
it.location == TextLocation("README.md", 100) && it.score == 100.0f
}?.license.toString() shouldBe "GPL-2.0-only WITH GCC-exception-2.0"
with(summary.licenseFindings) {
shouldHaveSize(18)
fviernau marked this conversation as resolved.
Show resolved Hide resolved
find { it.location == TextLocation("README.md", 100) && it.score == 100.0f }
?.license.toString() shouldBe "GPL-2.0-only WITH GCC-exception-2.0"
}
}

"get file-level findings with the 'preferFileLicense' option" {
val resultFile = getAssetFile("scancode-32.0.8_spdx-expression-parse_no-license-references.json")

val summary = parseResult(resultFile).toScanSummary(preferFileLicense = true)

summary.licenseFindings.map { it.license.toString() }.shouldContainExactlyInAnyOrder(
"LicenseRef-scancode-generic-cla AND MIT",
"MIT",
"MIT",
"GPL-2.0-only WITH GCC-exception-2.0 AND JSON AND BSD-2-Clause AND CC-BY-3.0 AND MIT",
"GPL-2.0-only WITH GCC-exception-2.0 AND BSD-3-Clause"
)
}
}

Expand Down
Loading
Loading