diff --git a/vanir/code_extractors/BUILD.bazel b/vanir/code_extractors/BUILD.bazel index f8a5288..b355a22 100644 --- a/vanir/code_extractors/BUILD.bazel +++ b/vanir/code_extractors/BUILD.bazel @@ -29,6 +29,7 @@ py_library( ":code_extractor_android", ":code_extractor_base", ":code_extractor_git", + ":code_extractor_linux", "//:vulnerability", requirement("requests"), ], @@ -57,6 +58,17 @@ py_library( ], ) +py_library( + name = "code_extractor_linux", + srcs = ["code_extractor_linux.py"], + deps = [ + ":code_extractor_base", + ":git_commit", + ":gitiles_commit", + "//:vulnerability", + ], +) + py_library( name = "gitiles_commit", srcs = ["gitiles_commit.py"], @@ -134,6 +146,21 @@ py_test( ], ) +py_test( + name = "code_extractor_linux_test", + srcs = ["code_extractor_linux_test.py"], + data = [ + "//vanir/testdata:test_patch_set", + ], + deps = [ + ":code_extractor_linux", + ":code_extractor_base", + ":git_commit", + "//:vulnerability", + requirement("absl-py"), + ], +) + py_test( name = "gitiles_commit_test", srcs = ["gitiles_commit_test.py"], diff --git a/vanir/code_extractors/code_extractor.py b/vanir/code_extractors/code_extractor.py index c4dc133..315e530 100644 --- a/vanir/code_extractors/code_extractor.py +++ b/vanir/code_extractors/code_extractor.py @@ -20,6 +20,7 @@ from vanir.code_extractors import code_extractor_android from vanir.code_extractors import code_extractor_base from vanir.code_extractors import code_extractor_git +from vanir.code_extractors import code_extractor_linux # pylint: enable=unused-import _P = TypeVar('_P', bound=code_extractor_base.AbstractCodeExtractor) diff --git a/vanir/code_extractors/code_extractor_linux.py b/vanir/code_extractors/code_extractor_linux.py new file mode 100644 index 0000000..1a943db --- /dev/null +++ b/vanir/code_extractors/code_extractor_linux.py @@ -0,0 +1,87 @@ +# Copyright 2023 Google LLC +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# 
https://developers.google.com/open-source/licenses/bsd + +"""Code extractors for Linux ecosystem packages of OSV CVEs. +""" + +import functools +import logging +from typing import Collection, FrozenSet, Mapping, Sequence, Tuple + +from vanir import vulnerability +from vanir.code_extractors import code_extractor_base +from vanir.code_extractors import git_commit +from vanir.code_extractors import gitiles_commit + + +@functools.cache +def _generate_commit(url: str, **kwargs) -> code_extractor_base.Commit: + """Generates Commit object for the given URL. + Args: + url: a URL pointing to a commit of a known source repo. + **kwargs: additional arguments to pass to the constructor of each Commit. + Returns: + A commit object containing all patches and files extracted from |url|. + Raises: + ValueError: when no known commit class can build a commit from |url|, + either because the URL is malformatted or incompatible with every known + source repo, or because fetching the commit data failed. + """ + known_commit_classes = [ + gitiles_commit.GitilesCommit, + git_commit.GitCommit, + ] + for commit_class in known_commit_classes: + try: + return commit_class(url, **kwargs) + except ( + code_extractor_base.IncompatibleUrlError, + code_extractor_base.CommitDataFetchError, + ): + continue + raise ValueError('Unknown commit URL: %s' % url) + + +class LinuxCodeExtractor(code_extractor_base.AbstractCodeExtractor): + """Code extractor for Linux affected packages.""" + + @classmethod + def is_supported_ecosystem(cls, ecosystem: str) -> bool: + return 'Debian' in ecosystem or 'Linux' in ecosystem + + def extract_commits_for_affected_entry( + self, + affected: vulnerability.AffectedEntry, + **kwargs, + ) -> Tuple[ + Sequence[code_extractor_base.Commit], + Sequence[code_extractor_base.FailedCommitUrl], + ]: + fix_urls = affected.ecosystem_specific.get('fixes', []) + commits = [] + failed_commit_urls = [] + for fix_url in fix_urls: + logging.info('Analyzing fix: %s', fix_url) + try: + commit = _generate_commit(fix_url, 
**kwargs) + commits.append(commit) + except (ValueError, code_extractor_base.CommitDataFetchError) as e: + failed_commit_urls.append( + code_extractor_base.FailedCommitUrl(fix_url, e) + ) + return (commits, failed_commit_urls) + + def extract_files_at_tip_of_unaffected_versions( + self, + package_name: str, + affected_versions: Collection[str], + files: Collection[str], + **kwargs, + ) -> Tuple[ + Sequence[code_extractor_base.Commit], + Sequence[code_extractor_base.FailedCommitUrl], + ]: + return ([], []) diff --git a/vanir/code_extractors/code_extractor_linux_test.py b/vanir/code_extractors/code_extractor_linux_test.py new file mode 100644 index 0000000..1cdab0b --- /dev/null +++ b/vanir/code_extractors/code_extractor_linux_test.py @@ -0,0 +1,113 @@ +# Copyright 2023 Google LLC +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://developers.google.com/open-source/licenses/bsd + +from unittest import mock + +from vanir import vulnerability +from vanir.code_extractors import code_extractor_linux +from vanir.code_extractors import code_extractor_base +from vanir.code_extractors import git_commit + +from absl.testing import absltest +from absl.testing import parameterized + +class CodeExtractorLinuxTest(parameterized.TestCase): + + def setUp(self): + super().setUp() + # special mock for git operations done in GitCommit's constructor + # return value must not be empty + self.enter_context( + mock.patch.object(git_commit.GitCommit, '_run_git', autospec=True) + ).return_value = b"mock-result" + + def test_commit_init_with_unknown_commit_url(self): + bad_url = 'https://unsupported.kernel.patch.source.com/blah' + affected = vulnerability.AffectedEntry({ + 'package': {'ecosystem': 'Linux', 'name': 'Kernel'}, + 'ecosystem_specific': {'fixes': [bad_url]}, + }) + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + 
self.assertEmpty(commits) + self.assertLen(failures, 1) + self.assertEqual(failures[0].url, bad_url) + self.assertIsInstance(failures[0].error, ValueError) + + def test_different_packages(self): + packages = ( + {'ecosystem': 'Linux', 'name': 'Kernel'}, + {'ecosystem': 'Debian:11', 'name': 'linux'} + ) + for package in packages: + affected = vulnerability.AffectedEntry({ + 'package': package, + 'ecosystem_specific': {'fixes': [ + 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1234567', + ]}, + }) + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertLen(failures, 0) + + def test_extractor_with_multiple_fixes_and_failures(self): + affected = vulnerability.AffectedEntry({ + 'package': {'ecosystem': 'Linux', 'name': 'Kernel'}, + 'ecosystem_specific': {'fixes': [ + 'https://git.kernel.org/linus/1234567', + 'https://git.kernel.org/stable/c/1234567', + 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1234567', + 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git@1234567', + 'https://github.com/torvalds/linux/commit/1234567', + 'https://unsupported.kernel.patch.source.com/blah', + ]}, + }) + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertLen(failures, 1) + self.assertEqual( + failures[0].url, + 'https://unsupported.kernel.patch.source.com/blah' + ) + self.assertLen(commits, 5) + for commit in commits: + self.assertIsInstance(commit, git_commit.GitCommit) + self.assertEqual(commit._rev, "1234567") + + self.assertEqual( + commits[0]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git' + ) + self.assertEqual( + commits[1]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git' + ) + self.assertEqual( + commits[2]._remote, + 
'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git' + ) + self.assertEqual( + commits[3]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git' + ) + self.assertEqual( + commits[4]._remote, + 'https://github.com/torvalds/linux' + ) + + def test_extract_with_empty_patch(self): + affected = vulnerability.AffectedEntry({ + 'package': {'ecosystem': 'Linux', 'name': 'Kernel'}, + 'ecosystem_specific': {'fixes': []}, + }) + + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertEmpty(commits) + self.assertEmpty(failures) + +if __name__ == '__main__': + absltest.main() diff --git a/vanir/code_extractors/git_commit.py b/vanir/code_extractors/git_commit.py index 151c109..0bdfb78 100644 --- a/vanir/code_extractors/git_commit.py +++ b/vanir/code_extractors/git_commit.py @@ -27,6 +27,15 @@ _GENERIC_URL_PATTERN = re.compile( r'(?P<remote>[^:]+://[^/]+/.+)/(?P<rev>[^/]+)' ) +# https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=382c27f4ed28f803b1f1473ac2d8db0afc795a1b +_LINUX_KERNEL_PATTERN = re.compile( + r'(?P<remote>[^:]+://git.kernel.org/pub/scm/linux/kernel/git/[^/]+/[^.]+.git)/commit/\?id=(?P<rev>[^/]+)' +) +# https://git.kernel.org/linus/1eff70a9abd46f175defafd29bc17ad456f398a7 +# https://git.kernel.org/stable/c/47f82395f04a976d4fa97de7f2acffa1c1096571 +_LINUX_KERNEL_PATTERN_SHORT = re.compile( + r'(?P<remote>[^:]+://git.kernel.org/)(?P<name>[^/]+)/(c/)?(?P<rev>[^/]+)' +) @functools.cache @@ -36,11 +45,22 @@ def _parse_url(url: str) -> Tuple[str, str]: _NORMALIZED_URL_PATTERN, _GITILES_URL_PATTERN, _GITHUB_URL_PATTERN, + _LINUX_KERNEL_PATTERN, + _LINUX_KERNEL_PATTERN_SHORT, _GENERIC_URL_PATTERN, ): match = pattern.fullmatch(url) if match: - return (match.group('remote'), match.group('rev')) + if pattern == _LINUX_KERNEL_PATTERN_SHORT: + # the shortened url does not contain the complete remote + if match.group('name') == 'linus': + remote = 
'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git' + return (remote, match.group('rev')) + elif match.group('name') == 'stable': + remote = 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git' + return (remote, match.group('rev')) + else: + return (match.group('remote'), match.group('rev')) raise code_extractor_base.IncompatibleUrlError(f'Unrecognized git URL: {url}') @@ -120,7 +140,15 @@ def __init__( self._run_git(['config', '--add', 'gc.auto', '0']) for src, dest in git_instead_ofs: self._run_git(['config', '--add', f'url.{dest}.insteadOf', src]) - self._fetch() + if self._rev_valid(): + self._fetch() + else: + # possible failure reason is a shortened commit hash + # which can be fixed by fetching all revisions + logging.info( + 'Revision is not valid. Is the commit hash shortened? Fetching all sources.' + ) + self._fetch_all() parents = self._run_git( ['rev-parse', f'{self._rev}^@'] ).decode('utf-8').strip().split() @@ -138,6 +166,22 @@ def _fetch(self): self._remote, self._rev, ]) + def _fetch_all(self): + return self._run_git_with_retry([ + 'fetch', '--quiet', '--no-tags', + self._remote, + ]) + + def _rev_valid(self) -> bool: + try: + self._run_git( + ['rev-parse', self._rev] + ) + except code_extractor_base.CommitDataFetchError: + return False + else: + return True + def _normalize_url(self) -> str: # Validation is already done in __init__(), inside _parse_url(). if 'github.com' in self._remote: diff --git a/vanir/vulnerability.py b/vanir/vulnerability.py index eed28e4..1303145 100644 --- a/vanir/vulnerability.py +++ b/vanir/vulnerability.py @@ -16,7 +16,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Union from google.protobuf import json_format from osv import vulnerability_pb2 - +from absl import logging from vanir import signature OSV_AFFECTED = 'affected' @@ -228,7 +228,73 @@ def __init__( # Convert all affected entries to AffectedEntry objects. 
self.affected = [] - for osv_affected in osv_vuln.get(OSV_AFFECTED, []): + + # Collect information about signatures and fixes + contains_signatures = False + contains_fixes = False + osv_affected_entries = osv_vuln.get(OSV_AFFECTED, []) + for osv_affected in osv_affected_entries: + for signature_location in (OSV_ECOSYSTEM_SPECIFIC, OSV_DATABASE_SPECIFIC): + if ( + signature_location in osv_affected + and 'vanir_signatures' in osv_affected[signature_location] + ): + contains_signatures = True + if ( + OSV_ECOSYSTEM_SPECIFIC in osv_affected + and 'fixes' in osv_affected[OSV_ECOSYSTEM_SPECIFIC] + ): + contains_fixes = True + + # Extract potentially useful patch information from references field if + # affected field is not present or existing affected fields do not contain fixes + if not contains_signatures and self.references is not None: + from vanir.code_extractors.git_commit import _LINUX_KERNEL_PATTERN + from vanir.code_extractors.git_commit import _LINUX_KERNEL_PATTERN_SHORT + if not osv_affected_entries: + logging.debug('The osv vulnerability does not contain any affected field') + # No affected entries, so we use the url to deduce the ecosystem + ecosystem = 'unknown' + for reference in self.references: + # Problem of missing affected entries mainly exists for Linux + if ( + _LINUX_KERNEL_PATTERN.fullmatch(reference.get('url') or '') + or _LINUX_KERNEL_PATTERN_SHORT.fullmatch(reference.get('url') or '') + ): + ecosystem = 'Linux' + break + if ecosystem != 'unknown': + dummy_entry = { + 'package': {'name': 'dummy_entry', 'ecosystem': ecosystem} + } + osv_affected_entries.append(dummy_entry) + if not contains_fixes and osv_affected_entries: + # OSV vulnerability contains no fixes. Remove all affected_entries except for the first one and add references. + # Entries without fixes are not useful for signature creation anyway and we do not want multiple entries + # with the same signatures. 
+ # Without these modifications we could not use this vulnerability at all. + logging.debug( + 'The osv vulnerability does not contain a fixes field. Trying to use data from references field instead.' + ) + fix_urls = [] + for reference in self.references: + # Some urls don't have the right type ('FIX') + if reference.get('url') and ( + reference.get('type') == 'FIX' + or _LINUX_KERNEL_PATTERN.fullmatch(reference['url']) + or _LINUX_KERNEL_PATTERN_SHORT.fullmatch(reference['url']) + ): + fix_urls.append(reference['url']) + if fix_urls: + affected_entry = osv_affected_entries[0] + if OSV_ECOSYSTEM_SPECIFIC not in affected_entry: + affected_entry[OSV_ECOSYSTEM_SPECIFIC] = {} + affected_entry[OSV_ECOSYSTEM_SPECIFIC]['fixes'] = fix_urls + osv_affected_entries = [affected_entry] + else: + osv_affected_entries = osv_vuln.get(OSV_AFFECTED, []) + + for osv_affected in osv_affected_entries: self.affected.append( AffectedEntry(osv_affected, store_signatures_in_legacy_location) ) diff --git a/vanir/vulnerability_manager.py b/vanir/vulnerability_manager.py index 48775c9..af8f4bf 100644 --- a/vanir/vulnerability_manager.py +++ b/vanir/vulnerability_manager.py @@ -303,7 +303,7 @@ class SignatureFilter(VulnerabilityFilter): def _should_filter_out(self, sign: signature.Signature, context_package: vulnerability.AffectedEntry, context_vul: vulnerability.Vulnerability) -> bool: - """Decides if the given |signautre| should be filtered out or not. + """Decides if the given |signature| should be filtered out or not. Args: sign: the signature to be tested. @@ -540,6 +540,9 @@ def generate_signatures( # Mapping from package -> list of patchsets for each affected version. patch_series = collections.defaultdict(list) for i, affected_entry in enumerate(vuln.affected): + if affected_entry.vanir_signatures: + logging.info('Affected entry already contains vanir signatures. 
Skipping.') + continue try: commits, failed_urls = code_extractor.extract_for_affected_entry( affected_entry, session @@ -556,6 +559,10 @@ def generate_signatures( osv_id, failed_url.url, failed_url.error, ) if not commits: + logging.debug( + 'Could not extract any commits for %s in affected entry %s. Skipping.', + osv_id, i, + ) continue patch_series[ @@ -659,6 +666,17 @@ def generate_signatures( affected_entry.vanir_signatures = list(refined_signatures) affected_entry.sort_vanir_signatures() + # Remove already consumed tmp files to free up some space. When processing + # many CVEs, /tmp will fill up too much otherwise. + for patchsets in patch_series.values(): + for commits in patchsets: + for commit in commits: + if hasattr(commit, '_working_root_dir_obj'): + del commit._working_root_dir_obj + if hasattr(commit, '_working_dir'): + del commit._working_dir + del patch_series + @property def affected_package_names(self) -> Collection[str]: """All affected package names of vulnerabilities in this manager. diff --git a/vanir/vulnerability_manager_test.py b/vanir/vulnerability_manager_test.py index b88bbca..21614be 100644 --- a/vanir/vulnerability_manager_test.py +++ b/vanir/vulnerability_manager_test.py @@ -518,10 +518,12 @@ def test_vulnerability_manager_generate_signatures(self): self._test_osv_vuln ) - def test_vulnerability_manager_generate_signature_fails_when_sig_exists(self): + def test_vulnerability_manager_generate_signature_logs_message_when_sig_exists(self): manager = vulnerability_manager.VulnerabilityManager([self._test_osv_vuln]) - with self.assertRaisesRegex(ValueError, '.*already exists.*'): + expected_log = 'INFO:absl:Affected entry already contains vanir signatures. Skipping.' + with self.assertLogs() as logger: manager.generate_signatures() + self.assertIn(expected_log, logger.output) self.assertLen(manager.vulnerabilities, 1) self.assertEqual( manager.vulnerabilities[0].to_osv_dict(),