From ea116447f8edb1b436057b1fac8d9300149c3307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yannik=20Tannh=C3=A4user?= Date: Wed, 29 Oct 2025 15:40:45 +0100 Subject: [PATCH 1/4] Add Linux kernel scanning support Add code extractor and tests for Linux kernel signature generation. Expand git_commit class to support fetching kernel sources. Using the offline scanner and created signatures, one can scan the kernel source tree. --- vanir/code_extractors/BUILD.bazel | 27 +++++ vanir/code_extractors/code_extractor.py | 1 + vanir/code_extractors/code_extractor_linux.py | 87 ++++++++++++++ .../code_extractor_linux_test.py | 113 ++++++++++++++++++ vanir/code_extractors/git_commit.py | 22 +++- 5 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 vanir/code_extractors/code_extractor_linux.py create mode 100644 vanir/code_extractors/code_extractor_linux_test.py diff --git a/vanir/code_extractors/BUILD.bazel b/vanir/code_extractors/BUILD.bazel index f8a5288..b355a22 100644 --- a/vanir/code_extractors/BUILD.bazel +++ b/vanir/code_extractors/BUILD.bazel @@ -29,6 +29,7 @@ py_library( ":code_extractor_android", ":code_extractor_base", ":code_extractor_git", + ":code_extractor_linux", "//:vulnerability", requirement("requests"), ], @@ -57,6 +58,17 @@ py_library( ], ) +py_library( + name = "code_extractor_linux", + srcs = ["code_extractor_linux.py"], + deps = [ + ":code_extractor_base", + ":git_commit", + ":gitiles_commit", + "//:vulnerability", + ], +) + py_library( name = "gitiles_commit", srcs = ["gitiles_commit.py"], @@ -134,6 +146,21 @@ py_test( ], ) +py_test( + name = "code_extractor_linux_test", + srcs = ["code_extractor_linux_test.py"], + data = [ + "//vanir/testdata:test_patch_set", + ], + deps = [ + ":code_extractor_linux", + ":code_extractor_base", + ":git_commit", + "//:vulnerability", + requirement("absl-py"), + ], +) + py_test( name = "gitiles_commit_test", srcs = ["gitiles_commit_test.py"], diff --git a/vanir/code_extractors/code_extractor.py 
b/vanir/code_extractors/code_extractor.py index c4dc133..315e530 100644 --- a/vanir/code_extractors/code_extractor.py +++ b/vanir/code_extractors/code_extractor.py @@ -20,6 +20,7 @@ from vanir.code_extractors import code_extractor_android from vanir.code_extractors import code_extractor_base from vanir.code_extractors import code_extractor_git +from vanir.code_extractors import code_extractor_linux # pylint: enable=unused-import _P = TypeVar('_P', bound=code_extractor_base.AbstractCodeExtractor) diff --git a/vanir/code_extractors/code_extractor_linux.py b/vanir/code_extractors/code_extractor_linux.py new file mode 100644 index 0000000..1a943db --- /dev/null +++ b/vanir/code_extractors/code_extractor_linux.py @@ -0,0 +1,87 @@ +# Copyright 2023 Google LLC +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://developers.google.com/open-source/licenses/bsd + +"""Code extractors for Linux ecosystem packages of OSV CVEs. +""" + +import functools +import logging +from typing import Collection, FrozenSet, Mapping, Sequence, Tuple + +from vanir import vulnerability +from vanir.code_extractors import code_extractor_base +from vanir.code_extractors import git_commit +from vanir.code_extractors import gitiles_commit + + +@functools.cache +def _generate_commit(url: str, **kwargs) -> code_extractor_base.Commit: + """Generates Commit object for the given URL. + Args: + url: a URL pointing a commit of a known source repo. + **kwargs: additional arguments to pass to the constructor of each Commit. + Returns: + A commit object containing all patches and files extracted from |url|. + Raises: + CommitDataFetchError: when fails to extract valid commit data from |url|. + ValueError: when the given URL is malformatted or not compatible with any + known source repos. 
+ """ + known_commit_classes = [ + gitiles_commit.GitilesCommit, + git_commit.GitCommit, + ] + for commit_class in known_commit_classes: + try: + return commit_class(url, **kwargs) + except ( + code_extractor_base.IncompatibleUrlError, + code_extractor_base.CommitDataFetchError, + ): + continue + raise ValueError('Unknown commit URL: %s' % url) + + +class LinuxCodeExtractor(code_extractor_base.AbstractCodeExtractor): + """Code extractor for Linux affected packages.""" + + @classmethod + def is_supported_ecosystem(cls, ecosystem: str) -> bool: + return 'Debian' in ecosystem or 'Linux' in ecosystem + + def extract_commits_for_affected_entry( + self, + affected: vulnerability.AffectedEntry, + **kwargs, + ) -> Tuple[ + Sequence[code_extractor_base.Commit], + Sequence[code_extractor_base.FailedCommitUrl], + ]: + fix_urls = affected.ecosystem_specific.get('fixes', []) + commits = [] + failed_commit_urls = [] + for fix_url in fix_urls: + logging.info('Analyzing fix: %s', fix_url) + try: + commit = _generate_commit(fix_url, **kwargs) + commits.append(commit) + except (ValueError, code_extractor_base.CommitDataFetchError) as e: + failed_commit_urls.append( + code_extractor_base.FailedCommitUrl(fix_url, e) + ) + return (commits, failed_commit_urls) + + def extract_files_at_tip_of_unaffected_versions( + self, + package_name: str, + affected_versions: Collection[str], + files: Collection[str], + **kwargs, + ) -> Tuple[ + Sequence[code_extractor_base.Commit], + Sequence[code_extractor_base.FailedCommitUrl], + ]: + return ([], []) diff --git a/vanir/code_extractors/code_extractor_linux_test.py b/vanir/code_extractors/code_extractor_linux_test.py new file mode 100644 index 0000000..1cdab0b --- /dev/null +++ b/vanir/code_extractors/code_extractor_linux_test.py @@ -0,0 +1,113 @@ +# Copyright 2023 Google LLC +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://developers.google.com/open-source/licenses/bsd + +from 
unittest import mock + +from vanir import vulnerability +from vanir.code_extractors import code_extractor_linux +from vanir.code_extractors import code_extractor_base +from vanir.code_extractors import git_commit + +from absl.testing import absltest +from absl.testing import parameterized + +class CodeExtractorLinuxTest(parameterized.TestCase): + + def setUp(self): + super().setUp() + # special mock for git operations done in GitCommit's constructor + # return value must not be empty + self.enter_context( + mock.patch.object(git_commit.GitCommit, '_run_git', autospec=True) + ).return_value = b"mock-result" + + def test_commit_init_with_unknown_commit_url(self): + bad_url = 'https://unsupported.kernel.patch.source.com/blah' + affected = vulnerability.AffectedEntry({ + 'package': {'ecosystem': 'Linux', 'name': 'Kernel'}, + 'ecosystem_specific': {'fixes': [bad_url]}, + }) + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertEmpty(commits) + self.assertLen(failures, 1) + self.assertEqual(failures[0].url, bad_url) + self.assertIsInstance(failures[0].error, ValueError) + + def test_different_packages(self): + packages = ( + {'ecosystem': 'Linux', 'name': 'Kernel'}, + {'ecosystem': 'Debian:11', 'name': 'linux'} + ) + for package in packages: + affected = vulnerability.AffectedEntry({ + 'package': package, + 'ecosystem_specific': {'fixes': [ + 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1234567', + ]}, + }) + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertLen(failures, 0) + + def test_extractor_with_multiple_fixes_and_failures(self): + affected = vulnerability.AffectedEntry({ + 'package': {'ecosystem': 'Linux', 'name': 'Kernel'}, + 'ecosystem_specific': {'fixes': [ + 'https://git.kernel.org/linus/1234567', + 'https://git.kernel.org/stable/c/1234567', + 
'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1234567', + 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git@1234567', + 'https://github.com/torvalds/linux/commit/1234567', + 'https://unsupported.kernel.patch.source.com/blah', + ]}, + }) + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertLen(failures, 1) + self.assertEqual( + failures[0].url, + 'https://unsupported.kernel.patch.source.com/blah' + ) + self.assertLen(commits, 5) + for commit in commits: + self.assertIsInstance(commit, git_commit.GitCommit) + self.assertEqual(commit._rev, "1234567") + + self.assertEqual( + commits[0]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git' + ) + self.assertEqual( + commits[1]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git' + ) + self.assertEqual( + commits[2]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git' + ) + self.assertEqual( + commits[3]._remote, + 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git' + ) + self.assertEqual( + commits[4]._remote, + 'https://github.com/torvalds/linux' + ) + + def test_extract_with_empty_patch(self): + affected = vulnerability.AffectedEntry({ + 'package': {'ecosystem': 'Linux', 'name': 'Kernel'}, + 'ecosystem_specific': {'fixes': []}, + }) + + extractor = code_extractor_linux.LinuxCodeExtractor() + commits, failures = extractor.extract_commits_for_affected_entry(affected) + self.assertEmpty(commits) + self.assertEmpty(failures) + +if __name__ == '__main__': + absltest.main() diff --git a/vanir/code_extractors/git_commit.py b/vanir/code_extractors/git_commit.py index 151c109..2a047fe 100644 --- a/vanir/code_extractors/git_commit.py +++ b/vanir/code_extractors/git_commit.py @@ -27,6 +27,15 @@ _GENERIC_URL_PATTERN = re.compile( r'(?P<remote>[^:]+://[^/]+/.+)/(?P<rev>[^/]+)' ) +# 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=382c27f4ed28f803b1f1473ac2d8db0afc795a1b +_LINUX_KERNEL_PATTERN = re.compile( + r'(?P<remote>[^:]+://git.kernel.org/pub/scm/linux/kernel/git/[^/]+/[^.]+.git)/commit/\?id=(?P<rev>[^/]+)' +) +# https://git.kernel.org/linus/1eff70a9abd46f175defafd29bc17ad456f398a7 +# https://git.kernel.org/stable/c/47f82395f04a976d4fa97de7f2acffa1c1096571 +_LINUX_KERNEL_PATTERN_SHORT = re.compile( + r'(?P<remote>[^:]+://git.kernel.org/)(?P<name>[^/]+)/(c/)?(?P<rev>[^/]+)' +) @functools.cache @@ -36,11 +45,22 @@ def _parse_url(url: str) -> Tuple[str, str]: _NORMALIZED_URL_PATTERN, _GITILES_URL_PATTERN, _GITHUB_URL_PATTERN, + _LINUX_KERNEL_PATTERN, + _LINUX_KERNEL_PATTERN_SHORT, _GENERIC_URL_PATTERN, ): match = pattern.fullmatch(url) if match: - return (match.group('remote'), match.group('rev')) + if pattern == _LINUX_KERNEL_PATTERN_SHORT: + # the shortened url does not contain the complete remote + if match.group('name') == 'linus': + remote = 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git' + return (remote, match.group('rev')) + elif match.group('name') == 'stable': + remote = 'https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git' + return (remote, match.group('rev')) + else: + return (match.group('remote'), match.group('rev')) raise code_extractor_base.IncompatibleUrlError(f'Unrecognized git URL: {url}') From efc173581dd8bad6798550889b79118ea3b5110a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yannik=20Tannh=C3=A4user?= Date: Wed, 17 Dec 2025 15:59:14 +0100 Subject: [PATCH 2/4] Add support for malformed CVE files In order to support osv.dev as vulnerability file source, fix urls are added from the "References" field where the vulnerability file does not offer this information in the "Affected" OSV entry. 
--- vanir/vulnerability.py | 70 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/vanir/vulnerability.py b/vanir/vulnerability.py index eed28e4..1303145 100644 --- a/vanir/vulnerability.py +++ b/vanir/vulnerability.py @@ -16,7 +16,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Union from google.protobuf import json_format from osv import vulnerability_pb2 - +from absl import logging from vanir import signature OSV_AFFECTED = 'affected' @@ -228,7 +228,73 @@ def __init__( # Convert all affected entries to AffectedEntry objects. self.affected = [] - for osv_affected in osv_vuln.get(OSV_AFFECTED, []): + + # Collect information about signatures and fixes + contains_signatures = False + contains_fixes = False + osv_affected_entries = osv_vuln.get(OSV_AFFECTED, []) + for osv_affected in osv_affected_entries: + for signature_location in (OSV_ECOSYSTEM_SPECIFIC, OSV_DATABASE_SPECIFIC): + if ( + signature_location in osv_affected + and 'vanir_signatures' in osv_affected[signature_location] + ): + contains_signatures = True + if ( + OSV_ECOSYSTEM_SPECIFIC in osv_affected + and 'fixes' in osv_affected[OSV_ECOSYSTEM_SPECIFIC] + ): + contains_fixes = True + + # Extract potentially useful patch information from references field if + # affected field is not present or existing affected fields do not contain fixes + if not contains_signatures and self.references is not None: + from vanir.code_extractors.git_commit import _LINUX_KERNEL_PATTERN + from vanir.code_extractors.git_commit import _LINUX_KERNEL_PATTERN_SHORT + if len(osv_affected_entries) == 0: + logging.debug('The osv vulnerability does not contain any affected field') + # No affected entries, so we use the url to deduce the ecosystem + ecosystem = 'unknown' + for reference in self.references: + # Problem of missing affected entries mainly exists for Linux + if ( + _LINUX_KERNEL_PATTERN.fullmatch(reference.get('url')) + or 
_LINUX_KERNEL_PATTERN_SHORT.fullmatch(reference.get('url')) + ): + ecosystem = 'Linux' + break + if ecosystem != 'unknown': + dummy_entry = { + 'package': {'name': 'dummy_entry', 'ecosystem': ecosystem} + } + osv_affected_entries.append(dummy_entry) + if not contains_fixes and len(osv_affected_entries) > 0: + # OSV vulnerability contains no fixes. Remove all affected_entries except for the first one and add references. + # Entries without fixes are not useful for signature creation anyway and we do not want multiple entries + # with the same signatures. + # Without these modifications we could not use this vulnerability at all. + logging.debug( + 'The osv vulnerability does not contain a fixes field. Trying to use data from references field instead.' + ) + fix_urls = [] + for reference in self.references: + # Some URLs don't have the right type ('FIX') + if ( + reference.get('type', []) == 'FIX' + or _LINUX_KERNEL_PATTERN.fullmatch(reference.get('url', '')) + or _LINUX_KERNEL_PATTERN_SHORT.fullmatch(reference.get('url', '')) + ): + fix_urls.append(reference.get('url')) + if fix_urls: + affected_entry = osv_affected_entries[0] + if not OSV_ECOSYSTEM_SPECIFIC in affected_entry: + affected_entry[OSV_ECOSYSTEM_SPECIFIC] = {} + affected_entry[OSV_ECOSYSTEM_SPECIFIC]['fixes'] = fix_urls + osv_affected_entries = [affected_entry] + else: + osv_affected_entries = osv_vuln.get(OSV_AFFECTED, []) + + for osv_affected in osv_affected_entries: self.affected.append( AffectedEntry(osv_affected, store_signatures_in_legacy_location) ) From f0d77c31e74206bb65891651d72320814fa469d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yannik=20Tannh=C3=A4user?= Date: Tue, 24 Mar 2026 14:19:51 +0100 Subject: [PATCH 3/4] Small improvements Fix typo, add log messages and improve memory consumption. 
--- vanir/vulnerability_manager.py | 20 +++++++++++++++++++- vanir/vulnerability_manager_test.py | 6 ++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/vanir/vulnerability_manager.py b/vanir/vulnerability_manager.py index 48775c9..af8f4bf 100644 --- a/vanir/vulnerability_manager.py +++ b/vanir/vulnerability_manager.py @@ -303,7 +303,7 @@ class SignatureFilter(VulnerabilityFilter): def _should_filter_out(self, sign: signature.Signature, context_package: vulnerability.AffectedEntry, context_vul: vulnerability.Vulnerability) -> bool: - """Decides if the given |signautre| should be filtered out or not. + """Decides if the given |signature| should be filtered out or not. Args: sign: the signature to be tested. @@ -540,6 +540,9 @@ def generate_signatures( # Mapping from package -> list of patchsets for each affected version. patch_series = collections.defaultdict(list) for i, affected_entry in enumerate(vuln.affected): + if len(affected_entry.vanir_signatures) > 0: + logging.info('Affected entry already contains vanir signatures. Skipping.') + continue try: commits, failed_urls = code_extractor.extract_for_affected_entry( affected_entry, session @@ -556,6 +559,10 @@ def generate_signatures( osv_id, failed_url.url, failed_url.error, ) if not commits: + logging.debug( + 'Could not extract any commits for %s in affected entry %s. Skipping.', + osv_id, i, + ) continue patch_series[ @@ -659,6 +666,17 @@ def generate_signatures( affected_entry.vanir_signatures = list(refined_signatures) affected_entry.sort_vanir_signatures() + # Remove already consumed tmp files to free up some space. When processing + # many CVEs, /tmp will fill up too much otherwise. 
+ for (_,_), patchsets in patch_series.items(): + for commits in patchsets: + for commit in commits: + if hasattr(commit, '_working_root_dir_obj'): + del commit._working_root_dir_obj + if hasattr(commit, '_working_dir'): + del commit._working_dir + del patch_series + @property def affected_package_names(self) -> Collection[str]: """All affected package names of vulnerabilities in this manager. diff --git a/vanir/vulnerability_manager_test.py b/vanir/vulnerability_manager_test.py index b88bbca..21614be 100644 --- a/vanir/vulnerability_manager_test.py +++ b/vanir/vulnerability_manager_test.py @@ -518,10 +518,12 @@ def test_vulnerability_manager_generate_signatures(self): self._test_osv_vuln ) - def test_vulnerability_manager_generate_signature_fails_when_sig_exists(self): + def test_vulnerability_manager_generate_signature_logs_message_when_sig_exists(self): manager = vulnerability_manager.VulnerabilityManager([self._test_osv_vuln]) - with self.assertRaisesRegex(ValueError, '.*already exists.*'): + expected_log = 'INFO:absl:Affected entry already contains vanir signatures. Skipping.' + with self.assertLogs() as logger: manager.generate_signatures() + self.assertIn(expected_log, logger.output) self.assertLen(manager.vulnerabilities, 1) self.assertEqual( manager.vulnerabilities[0].to_osv_dict(), From ad0a06fd403a46b76381432421f89fd4a0bf4202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yannik=20Tannh=C3=A4user?= Date: Thu, 18 Dec 2025 16:46:07 +0100 Subject: [PATCH 4/4] Fix shortened commit hash issues The "git fetch <remote> <rev>" command is not working for shortened commit hashes due to ambiguity. In case git does not recognize the revision, fetch all sources. This fixes the issue where patches cannot be loaded due to shortened hashes in the CVE file. 
--- vanir/code_extractors/git_commit.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/vanir/code_extractors/git_commit.py b/vanir/code_extractors/git_commit.py index 2a047fe..0bdfb78 100644 --- a/vanir/code_extractors/git_commit.py +++ b/vanir/code_extractors/git_commit.py @@ -140,7 +140,15 @@ def __init__( self._run_git(['config', '--add', 'gc.auto', '0']) for src, dest in git_instead_ofs: self._run_git(['config', '--add', f'url.{dest}.insteadOf', src]) - self._fetch() + if self._rev_valid(): + self._fetch() + else: + # possible failure reason is a shortened commit hash + # which can be fixed by fetching all revisions + logging.info( + 'Revision is not valid. Is the commit hash shortened? Fetching all sources.' + ) + self._fetch_all() parents = self._run_git( ['rev-parse', f'{self._rev}^@'] ).decode('utf-8').strip().split() @@ -158,6 +166,22 @@ def _fetch(self): self._remote, self._rev, ]) + def _fetch_all(self): + return self._run_git_with_retry([ + 'fetch', '--quiet', '--no-tags', + self._remote, + ]) + + def _rev_valid(self) -> bool: + try: + self._run_git( + ['rev-parse', f'{self._rev}'] + ) + except code_extractor_base.CommitDataFetchError as e: + return False + else: + return True + def _normalize_url(self) -> str: # Validation is already done in __init__(), inside _parse_url(). if 'github.com' in self._remote: