Skip to content

Commit

Permalink
Merge pull request #3423 from nexB/fix-package-scan-only-performance
Browse files Browse the repository at this point in the history
Fix package scan only performance
  • Loading branch information
AyanSinhaMahapatra committed Jun 7, 2023
2 parents 3282bc0 + 5d8db2c commit 94d4fe6
Show file tree
Hide file tree
Showing 37 changed files with 17,378 additions and 2,007 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,17 @@ v32.1.0 (next, roadmap)
See https://github.com/nexB/scancode-toolkit/issues/1745


v32.0.3 - 2023-05-26
v32.0.4 - 2023-06-07
---------------------

This is a minor bugfix release with the following updates:

- Fixes a performance issue arising out of license detection
on files happening in a single-threaded process_codebase step when the
license CLI option is disabled for a package scan.
Reference: https://github.com/nexB/scancode-toolkit/pull/3423

v32.0.3 - 2023-06-06
---------------------

This is a minor bugfix release with the following updates:
Expand Down
2 changes: 1 addition & 1 deletion setup-mini.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scancode-toolkit-mini
version = 32.0.3
version = 32.0.4
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scancode-toolkit
version = 32.0.3
version = 32.0.4
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
8 changes: 8 additions & 0 deletions src/packagedcode/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

from commoncode import fileutils

from licensedcode.cache import build_spdx_license_expression
from licensedcode.cache import get_cache
from licensedcode.tokenize import query_tokenizer
from licensedcode.detection import detect_licenses
from licensedcode.detection import get_unknown_license_detection
Expand Down Expand Up @@ -122,6 +124,11 @@ def assemble(cls, package_data, resource, codebase, package_adder):
resource=resource,
codebase=codebase,
)
if package.declared_license_expression:
package.declared_license_expression_spdx = str(build_spdx_license_expression(
license_expression=package.declared_license_expression,
licensing=get_cache().licensing,
))

cls.assign_package_to_resources(
package=package,
Expand All @@ -132,6 +139,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

yield package


# we yield this as we do not want this further processed
yield resource

Expand Down
32 changes: 6 additions & 26 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))


def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
def add_referenced_license_matches_for_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
detections to the package manifests detected in this resource, following their
Expand Down Expand Up @@ -106,13 +106,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
if not referenced_resource:
continue

if no_licenses:
referenced_license_detections = get_license_detection_mappings(
location=referenced_resource.location
)

else:
referenced_license_detections = referenced_resource.license_detections
referenced_license_detections = referenced_resource.license_detections

if referenced_license_detections:
modified = True
Expand Down Expand Up @@ -160,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
yield resource


def add_referenced_license_detection_from_package(resource, codebase, no_licenses):
def add_referenced_license_detection_from_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
matches (licenses and license_expressions) following their Rule
Expand Down Expand Up @@ -209,7 +203,6 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
sibling_license_detections, _le = get_license_detections_from_sibling_file(
resource=root_resource,
codebase=codebase,
no_licenses=no_licenses,
)
if TRACE:
logger_debug(
Expand Down Expand Up @@ -278,12 +271,10 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
yield resource


def add_license_from_sibling_file(resource, codebase, no_licenses):
def add_license_from_sibling_file(resource, codebase):
"""
Given a resource and its codebase object, assign licenses to the package
detections in that resource, from the sibling files of it.
If `no_license` is True, then license scan (for resources) is disabled.
"""
if TRACE:
logger_debug(f'packagedcode.licensing: add_license_from_sibling_file: resource: {resource.path}')
Expand All @@ -303,7 +294,6 @@ def add_license_from_sibling_file(resource, codebase, no_licenses):
license_detections, license_expression = get_license_detections_from_sibling_file(
resource=resource,
codebase=codebase,
no_licenses=no_licenses,
)
if not license_detections:
return
Expand Down Expand Up @@ -333,13 +323,11 @@ def is_legal_or_readme(resource):
return False


def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
def get_license_detections_from_sibling_file(resource, codebase):
"""
Return `license_detections`, a list of LicenseDetection objects and a
`license_expression`, given a resource and its codebase object, from
the sibling files of the resource.
If `no_license` is True, then license scan (for resources) is disabled.
"""
siblings = []

Expand All @@ -357,15 +345,7 @@ def get_license_detections_from_sibling_file(resource, codebase, no_licenses):

license_detections = []
for sibling in siblings:
if no_licenses:
detections = get_license_detection_mappings(
location=sibling.location,
analysis=DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value,
post_scan=True,
)
license_detections.extend(detections)
else:
license_detections.extend(sibling.license_detections)
license_detections.extend(sibling.license_detections)

if not license_detections:
return [], None
Expand Down
25 changes: 12 additions & 13 deletions src/packagedcode/plugin_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,17 +194,19 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):
Also perform additional package license detection that depends on either
file license detection or the package detections.
"""
no_licenses = False
has_licenses = hasattr(codebase.root, 'license_detections')

# These steps add proper license detections to package_data and hence
# this is performed before top level packages creation
for resource in codebase.walk(topdown=False):
if not hasattr(resource, 'license_detections'):
no_licenses = True
if not has_licenses:
#TODO: Add the steps where we detect licenses from files for only a package scan
# in the multiprocessing get_package_data API function
continue

# If we don't detect license in package_data but there is license detected in file
# we add the license expression from the file to a package
modified = add_license_from_file(resource, codebase, no_licenses)
modified = add_license_from_file(resource, codebase)
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_license_from_file: modified: {modified}')

Expand All @@ -213,30 +215,30 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):

# If there are referenced files in an extracted license statement, we follow
# the references, look for license detections and add them back
modified = list(add_referenced_license_matches_for_package(resource, codebase, no_licenses))
modified = list(add_referenced_license_matches_for_package(resource, codebase))
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_for_package: modified: {modified}')

# If there is a LICENSE file on the same level as the manifest, and no license
# is detected in the package_data, we add the license from the file
modified = add_license_from_sibling_file(resource, codebase, no_licenses)
modified = add_license_from_sibling_file(resource, codebase)
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_license_from_sibling_file: modified: {modified}')

# Create codebase-level packages and dependencies
create_package_and_deps(codebase, strip_root=strip_root, **kwargs)

if not no_licenses:
if has_licenses:
# This step is dependent on top level packages
for resource in codebase.walk(topdown=False):
# If there is an unknown reference to a package we add the license
# from the package license detection
modified = list(add_referenced_license_detection_from_package(resource, codebase, no_licenses))
modified = list(add_referenced_license_detection_from_package(resource, codebase))
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_from_package: modified: {modified}')


def add_license_from_file(resource, codebase, no_licenses):
def add_license_from_file(resource, codebase):
"""
Given a Resource, check if the detected package_data doesn't have license detections
and the file has license detections, and if so, populate the package_data license
Expand All @@ -248,10 +250,7 @@ def add_license_from_file(resource, codebase, no_licenses):
if not resource.is_file:
return

if no_licenses:
license_detections_file = get_license_detection_mappings(location=resource.location)
else:
license_detections_file = resource.license_detections
license_detections_file = resource.license_detections

if TRACE:
logger_debug(f'add_license_from_file: license_detections_file: {license_detections_file}')
Expand Down
5 changes: 3 additions & 2 deletions src/scancode_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,12 @@ def _create_dir(location):
# 4. hardcoded This is the default, fallback version in case package is not installed or we
# do not have a proper version otherwise.
if not __version__:
__version__ = '32.0.3'
__version__ = '32.0.4'

#######################
# used to warn user when the version is out of date
__release_date__ = datetime.datetime(2023, 6, 6)
# this is (year, month, day)
__release_date__ = datetime.datetime(2023, 6, 7)

# See https://github.com/nexB/scancode-toolkit/issues/2653 for more information
# on the data format version
Expand Down
6 changes: 3 additions & 3 deletions tests/formattedcode/test_output_cyclonedx.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,22 +228,22 @@ def test_cyclonedx_plugin_does_not_fail_without_packages():
def test_cyclonedx_plugin_json():
test_dir = test_env.get_test_loc('cyclonedx/simple')
result_file = test_env.get_temp_file('cyclonedx.json')
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
run_scan_click(['--package', test_dir, '--cyclonedx', result_file])
expected_file = test_env.get_test_loc('cyclonedx/simple-expected.json')
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)


def test_cyclonedx_plugin_json_simple_package_icu():
test_dir = test_env.get_test_loc('cyclonedx/simple-icu')
result_file = test_env.get_temp_file('cyclonedx.json')
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
run_scan_click(['--package', '--license', test_dir, '--cyclonedx', result_file])
expected_file = test_env.get_test_loc('cyclonedx/simple-icu-expected.json')
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)


def test_cyclonedx_plugin_xml_components_and_dependencies_are_serialized_correctly():
test_dir = test_env.get_test_loc('cyclonedx/simple')
result_file = test_env.get_temp_file('cyclonedx.xml')
run_scan_click(['-p', test_dir, '--cyclonedx-xml', result_file])
run_scan_click(['--package', test_dir, '--cyclonedx-xml', result_file])
expected_file = test_env.get_test_loc('cyclonedx/expected.xml')
check_cyclone_xml_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)
26 changes: 3 additions & 23 deletions tests/packagedcode/data/build/buck/end2end-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -247,29 +247,9 @@
"vcs_url": null,
"copyright": null,
"holder": null,
"declared_license_expression": "apache-2.0",
"declared_license_expression_spdx": "Apache-2.0",
"license_detections": [
{
"license_expression": "apache-2.0",
"matches": [
{
"score": 100.0,
"start_line": 1,
"end_line": 1,
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
"license_expression": "apache-2.0",
"rule_identifier": "spdx_license_id_apache-2.0_for_apache-2.0.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE",
"matched_text": "apache-2.0"
}
],
"identifier": "apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8"
}
],
"declared_license_expression": null,
"declared_license_expression_spdx": null,
"license_detections": [],
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
Expand Down
Loading

0 comments on commit 94d4fe6

Please sign in to comment.