From 6d583cd2bb6e2bbc6a7031a2d53aa570db299dd6 Mon Sep 17 00:00:00 2001 From: Aniket Date: Sun, 15 Feb 2026 15:51:19 +0530 Subject: [PATCH 1/4] Create duplicate_issue_detector.yaml --- .../workflows/duplicate_issue_detector.yaml | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 .github/workflows/duplicate_issue_detector.yaml diff --git a/.github/workflows/duplicate_issue_detector.yaml b/.github/workflows/duplicate_issue_detector.yaml new file mode 100644 index 000000000..4af456067 --- /dev/null +++ b/.github/workflows/duplicate_issue_detector.yaml @@ -0,0 +1,153 @@ +name: Smart Duplicate Issue Detector (Semantic) + +on: + issues: + types: [opened] + +permissions: + issues: write + +jobs: + detect-duplicates: + runs-on: ubuntu-latest + + steps: + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + pip install --no-cache-dir sentence-transformers scikit-learn + + - name: Semantic duplicate detection (open + closed) + uses: actions/github-script@v6 + with: + script: | + const fs = require('fs'); + const issue = context.payload.issue; + + const issues = await github.paginate( + github.rest.issues.listForRepo, + { + owner: context.repo.owner, + repo: context.repo.repo, + state: 'all', + per_page: 100 + } + ); + + const data = { + current: { + number: issue.number, + title: issue.title, + body: issue.body || '' + }, + others: issues + .filter(i => i.number !== issue.number) + .map(i => ({ + number: i.number, + title: i.title, + body: i.body || '', + url: i.html_url, + state: i.state + })) + }; + + fs.writeFileSync('issues.json', JSON.stringify(data)); + + - name: Run semantic similarity analysis + run: | + python << 'EOF' + import json + from sentence_transformers import SentenceTransformer + from sklearn.metrics.pairwise import cosine_similarity + + THRESHOLD = 0.82 # good balance + MAX_RESULTS = 3 + + with open("issues.json") as f: + data = json.load(f) + + model = SentenceTransformer("all-MiniLM-L6-v2") + + def text(issue): + return f"{issue['title']} {issue['body']}".strip() + + current_text = text(data["current"]) + others = data["others"] + + embeddings = model.encode( + [current_text] + [text(i) for i in others], + normalize_embeddings=True + ) + + current_vec = embeddings[0] + other_vecs = embeddings[1:] + + sims = cosine_similarity([current_vec], other_vecs)[0] + + matches = [] + for issue, score in zip(others, sims): + if score >= THRESHOLD: + matches.append({ + "number": issue["number"], + "title": issue["title"], + "url": issue["url"], + "state": issue["state"], + "score": round(score * 100, 1) + }) + + matches = sorted(matches, key=lambda x: x["score"], reverse=True)[:MAX_RESULTS] + + with open("matches.json", "w") as f: + json.dump(matches, f) + EOF + + - name: Comment and label (non-blocking) + uses: actions/github-script@v6 + with: + script: | + const fs = require('fs'); + const matches = JSON.parse(fs.readFileSync('matches.json', 'utf8')); + + if (matches.length === 0) { + core.notice('No semantic duplicates found.'); + return; + } + + const list = matches.map( + (m, i) => + `${i + 1}. **${m.title}** (#${m.number}, ${m.state})\n` + + ` ${m.url}\n` + + ` Similarity: ${m.score}%` + ).join('\n\n'); + + const safe = async (fn) => { + try { await fn(); } catch { + core.notice('Skipped write action due to permissions'); + } + }; + + await safe(() => + github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.issue.number, + body: + `⚠️ **Potential Duplicate Issue (Semantic Match)**\n\n` + + `This issue appears semantically similar to the following open or closed issues:\n\n` + + `${list}\n\n` + + `Please review before proceeding.` + }) + ); + + await safe(() => + github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.issue.number, + labels: ['duplicate'] + }) + ); From a0cac45d9984d92283c94e626a0408d873ccfd9c Mon Sep 17 00:00:00 2001 From: Aniket Date: Sun, 15 Feb 2026 15:58:45 +0530 Subject: [PATCH 2/4] Update .github/workflows/duplicate_issue_detector.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .github/workflows/duplicate_issue_detector.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/duplicate_issue_detector.yaml b/.github/workflows/duplicate_issue_detector.yaml index 4af456067..19e448069 100644 --- a/.github/workflows/duplicate_issue_detector.yaml +++ b/.github/workflows/duplicate_issue_detector.yaml @@ -96,7 +96,7 @@ jobs: "title": issue["title"], "url": issue["url"], "state": issue["state"], - "score": round(score * 100, 1) + "score": round(float(score) * 100, 1) }) matches = sorted(matches, key=lambda x: x["score"], reverse=True)[:MAX_RESULTS] From 497aeb89448a9fa68ccd43937a94a029aef26de8 Mon Sep 17 00:00:00 2001 From: Aniket Date: Sun, 15 Feb 2026 16:00:13 +0530 Subject: [PATCH 3/4] Code rabbit follow-up --- .../workflows/duplicate_issue_detector.yaml | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/.github/workflows/duplicate_issue_detector.yaml b/.github/workflows/duplicate_issue_detector.yaml index 19e448069..924672b52 100644 --- a/.github/workflows/duplicate_issue_detector.yaml +++ b/.github/workflows/duplicate_issue_detector.yaml @@ -22,7 +22,7 @@ jobs: pip install --no-cache-dir sentence-transformers scikit-learn - name: Semantic duplicate detection (open + closed) - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | const fs = require('fs'); @@ -45,7 +45,7 @@ jobs: body: issue.body || '' }, others: issues - .filter(i => i.number !== issue.number) + .filter(i => i.number !== issue.number && !i.pull_request) .map(i => ({ number: i.number, title: i.title, @@ -64,7 +64,7 @@ jobs: from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity - THRESHOLD = 0.82 # good balance + THRESHOLD = 0.82 MAX_RESULTS = 3 with open("issues.json") as f: @@ -106,7 +106,7 @@ jobs: EOF - name: Comment and label (non-blocking) - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | const fs = require('fs'); @@ -125,8 +125,8 @@ jobs: ).join('\n\n'); const safe = async (fn) => { - try { await fn(); } catch { - core.notice('Skipped write action due to permissions'); + try { await fn(); } catch (e) { + core.notice(`Skipped write action: ${e.message}`); } }; @@ -143,11 +143,34 @@ jobs: }) ); + // Ensure softer label exists + const labelName = 'possible-duplicate'; + + try { + await github.rest.issues.getLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: labelName + }); + } catch (e) { + if (e.status === 404) { + await safe(() => + github.rest.issues.createLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: labelName, + color: 'FBCA04', + description: 'Potential semantic duplicate' + }) + ); + } + } + await safe(() => github.rest.issues.addLabels({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.payload.issue.number, - labels: ['duplicate'] + labels: [labelName] }) ); From e28a15adfb2915273d61ed94d032bc65553c136b Mon Sep 17 00:00:00 2001 From: Aniket Date: Sun, 15 Feb 2026 16:19:57 +0530 Subject: [PATCH 4/4] Update duplicate_issue_detector.yaml --- .../workflows/duplicate_issue_detector.yaml | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/.github/workflows/duplicate_issue_detector.yaml b/.github/workflows/duplicate_issue_detector.yaml index 924672b52..54bd6b224 100644 --- a/.github/workflows/duplicate_issue_detector.yaml +++ b/.github/workflows/duplicate_issue_detector.yaml @@ -21,19 +21,19 @@ jobs: run: | pip install --no-cache-dir sentence-transformers scikit-learn - - name: Semantic duplicate detection (open + closed) + - name: Fetch upstream issues (AOSSIE-Org/PictoPy) uses: actions/github-script@v7 with: script: | const fs = require('fs'); const issue = context.payload.issue; - const issues = await github.paginate( + const upstreamIssues = await github.paginate( github.rest.issues.listForRepo, { - owner: context.repo.owner, - repo: context.repo.repo, - state: 'all', + owner: "AOSSIE-Org", + repo: "PictoPy", + state: "all", per_page: 100 } ); @@ -42,20 +42,20 @@ jobs: current: { number: issue.number, title: issue.title, - body: issue.body || '' + body: issue.body || "" }, - others: issues - .filter(i => i.number !== issue.number && !i.pull_request) + others: upstreamIssues + .filter(i => !i.pull_request) .map(i => ({ number: i.number, title: i.title, - body: i.body || '', + body: i.body || "", url: i.html_url, state: i.state })) }; - fs.writeFileSync('issues.json', JSON.stringify(data)); + fs.writeFileSync("issues.json", JSON.stringify(data)); - name: Run semantic similarity analysis run: | @@ -78,6 +78,11 @@ jobs: current_text = text(data["current"]) others = data["others"] + if not others: + with open("matches.json", "w") as f: + json.dump([], f) + exit() + embeddings = model.encode( [current_text] + [text(i) for i in others], normalize_embeddings=True @@ -105,15 +110,15 @@ jobs: json.dump(matches, f) EOF - - name: Comment and label (non-blocking) + - name: Comment and soft-label in fork (non-blocking) uses: actions/github-script@v7 with: script: | - const fs = require('fs'); - const matches = JSON.parse(fs.readFileSync('matches.json', 'utf8')); + const fs = require("fs"); + const matches = JSON.parse(fs.readFileSync("matches.json", "utf8")); if (matches.length === 0) { - core.notice('No semantic duplicates found.'); + core.notice("No semantic duplicates found."); return; } @@ -122,12 +127,11 @@ jobs: `${i + 1}. **${m.title}** (#${m.number}, ${m.state})\n` + ` ${m.url}\n` + ` Similarity: ${m.score}%` - ).join('\n\n'); + ).join("\n\n"); const safe = async (fn) => { - try { await fn(); } catch (e) { - core.notice(`Skipped write action: ${e.message}`); - } + try { await fn(); } + catch (e) { core.notice(`Skipped write action: ${e.message}`); } }; await safe(() => @@ -137,14 +141,13 @@ jobs: issue_number: context.payload.issue.number, body: `⚠️ **Potential Duplicate Issue (Semantic Match)**\n\n` + - `This issue appears semantically similar to the following open or closed issues:\n\n` + + `This issue appears semantically similar to the following issues in AOSSIE-Org/PictoPy:\n\n` + `${list}\n\n` + `Please review before proceeding.` }) ); - // Ensure softer label exists - const labelName = 'possible-duplicate'; + const labelName = "possible-duplicate"; try { await github.rest.issues.getLabel({ @@ -159,8 +162,8 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, name: labelName, - color: 'FBCA04', - description: 'Potential semantic duplicate' + color: "FBCA04", + description: "Potential semantic duplicate (upstream comparison)" }) ); }