Commit 7b66986

Add local SHA1 resolution

1 parent 89f5a25

2 files changed: 155 additions & 4 deletions

File tree

- README.md
- resolve_swh_hashes_local.py

README.md — 10 additions & 4 deletions

````diff
@@ -62,10 +62,16 @@ python extract_structural_entities.py
 ```
 
 **Phase 3b: Identity Resolution**
-Resolve Git-compatible hashes to standard SHA1 via the Software Heritage API:
-```bash
-python resolve_swh_hashes.py
-```
+Resolve Git-compatible hashes to standard SHA1, either locally (fast) or via the Software Heritage API (official):
+
+* **Option A: Local Resolution (Recommended)**
+  ```bash
+  python resolve_swh_hashes_local.py
+  ```
+* **Option B: API-based Resolution**
+  ```bash
+  python resolve_swh_hashes.py
+  ```
 
 ### Phase 4: Indexing (Remote/GPU)
 Move `raw_functions.json` to a GPU-equipped environment to compute neural vectors and build the FAISS index.
````
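Why the local option is safe: SWH's `sha1_git` is exactly Git's blob hash, SHA1 over a `blob <size>\0` header followed by the content, so it can be computed offline. A minimal sketch of that equivalence, cross-checked against a `git` binary (assumed on PATH; not part of this commit):

```python
# Git's blob hash (SWH's sha1_git) is SHA1 over "blob <size>\0" + content,
# so no Software Heritage API round-trip is needed to resolve it.
import hashlib
import subprocess

content = b"hello swh\n"

# Pure-Python blob hash, mirroring the script's hashlib fallback path
header = f"blob {len(content)}\0".encode()
sha1_git = hashlib.sha1(header + content).hexdigest()

# Cross-check against Git itself (assumes `git` is installed)
git_hash = subprocess.run(
    ["git", "hash-object", "--stdin"],
    input=content, capture_output=True, check=True,
).stdout.decode().strip()

assert sha1_git == git_hash  # identical, so resolution can stay offline
```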
resolve_swh_hashes_local.py — 145 additions & 0 deletions (new file)

```python
# This file is part of MediaWiki Code2Code Search
# <https://github.com/ftosoni/mediawiki-code2code-search>.
# Copyright (c) 2026 Francesco Tosoni.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import hashlib
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Paths relative to this script
PREPROC_DIR = os.path.dirname(os.path.abspath(__file__))
# Input from Phase 3a
UNRESOLVED_METADATA_PATH = os.path.join(PREPROC_DIR, "raw_metadata_unresolved.json")
# Output for Phase 4 (Indexing)
FINAL_METADATA_PATH = os.path.join(PREPROC_DIR, "..", "backend", "raw_functions.json")
# Local repo root
LOCAL_REPOS_ROOT = os.path.join(PREPROC_DIR, "mediawiki_repos")

# Use swh.model for official local hashing when it is installed
try:
    from swh.model.hashutil import MultiHash
    SWH_MODEL_AVAILABLE = True
except ImportError:
    SWH_MODEL_AVAILABLE = False


def get_file_hashes(filepath: str):
    """Compute both standard SHA1 and Git-compatible SHA1 (sha1_git) locally."""
    try:
        with open(filepath, "rb") as f:
            # Normalize CRLF so hashes match LF checkouts
            content = f.read().replace(b"\r\n", b"\n")

        if SWH_MODEL_AVAILABLE:
            # Official swh.model implementation
            hashes = MultiHash.from_data(content).digest()
            return {
                "sha1": hashes["sha1"].hex(),
                "sha1_git": hashes["sha1_git"].hex(),
            }
        else:
            # Fallback: sha1_git is SHA1 over a Git blob header plus the content
            sha1 = hashlib.sha1(content).hexdigest()
            header = f"blob {len(content)}\0".encode()
            sha1_git = hashlib.sha1(header + content).hexdigest()
            return {"sha1": sha1, "sha1_git": sha1_git}
    except Exception:
        # Unreadable file: caller treats None as "unresolved"
        return None


def resolve_hashes():
    print("== Phase 3b: Local Identity Resolution (API-less) ==")

    if not os.path.exists(UNRESOLVED_METADATA_PATH):
        print(f"Error: {UNRESOLVED_METADATA_PATH} not found. Run extract_structural_entities.py first.")
        return

    print("Loading unresolved metadata...")
    with open(UNRESOLVED_METADATA_PATH, "r", encoding="utf-8") as f:
        entities = json.load(f)

    # Identify unique files to process (many entities share a file)
    unique_files = {}  # (repo_group, repo_name, filepath) -> info
    for ent in entities:
        key = (ent["repo_group"], ent["repo_name"], ent["filepath"])
        if key not in unique_files:
            unique_files[key] = {
                "full_path": os.path.join(LOCAL_REPOS_ROOT, ent["repo_group"], ent["repo_name"], ent["filepath"])
            }

    total_files = len(unique_files)
    print(f"Found {len(entities)} entities across {total_files} unique files.")
    print("Resolving hashes locally (using multiple threads for I/O)...")

    resolved_cache = {}  # key -> {sha1, sha1_git}
    start_time = time.time()

    # Hashing is I/O-bound, so oversubscribe the CPU count
    max_workers = (os.cpu_count() or 4) * 2
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_key = {
            executor.submit(get_file_hashes, info["full_path"]): key
            for key, info in unique_files.items()
        }

        completed = 0
        for future in as_completed(future_to_key):
            key = future_to_key[future]
            try:
                res = future.result()
                if res:
                    resolved_cache[key] = res
            except Exception:
                pass

            completed += 1
            if completed % 100 == 0 or completed == total_files:
                elapsed = time.time() - start_time
                print(f"Progress: {completed}/{total_files} files resolved "
                      f"({(completed / total_files) * 100:.1f}%, {elapsed:.1f}s elapsed)")

    # Map resolved hashes back to entities and perform cleanup
    print("\nMapping resolved hashes back to entities and generating SWHIDs...")
    final_list = []
    for ent in entities:
        key = (ent["repo_group"], ent["repo_name"], ent["filepath"])
        res = resolved_cache.get(key)

        if res:
            ent["sha1"] = res["sha1"]
            # Qualified SWHID: swh:1:cnt:<sha1_git>;origin=<url>;lines=<start>-<end>
            sha1_git = res["sha1_git"]
            ent["swhid"] = f"swh:1:cnt:{sha1_git};origin={ent['swh_origin']};lines={ent['start_line']}-{ent['end_line']}"

        # Remove phase-3a specific fields
        ent.pop("swhid_hash", None)
        ent.pop("swh_origin", None)

        final_list.append(ent)

    print(f"Final mapping complete. {len(final_list)} entities ready.")

    os.makedirs(os.path.dirname(FINAL_METADATA_PATH), exist_ok=True)
    with open(FINAL_METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(final_list, f, indent=2)

    print(f"Final metadata saved to {FINAL_METADATA_PATH}")
    print(f"✅ Resolution complete in {time.time() - start_time:.1f}s. Ready for Phase 4: Vector Indexing.")


if __name__ == "__main__":
    resolve_hashes()
```
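For orientation, a sketch of what one resolved entity looks like once it lands in `backend/raw_functions.json`. The field names are taken from the script above; the repo, path, and hash placeholders are purely illustrative:

```python
# Hypothetical resolved entity (illustrative values, real field names).
example_entity = {
    "repo_group": "extensions",        # grouping used to locate the local clone
    "repo_name": "SomeExtension",      # illustrative repo name
    "filepath": "includes/Hooks.php",  # path within the repo
    "start_line": 10,
    "end_line": 42,
    "sha1": "<standard sha1 of the normalized content>",
    "swhid": "swh:1:cnt:<sha1_git>;origin=<repo url>;lines=10-42",
    # "swhid_hash" and "swh_origin" are dropped as Phase-3a-only fields
}
```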
