
Commit 045c4c5

feat: Cloud Run Job deployment for full 6-year Common Crawl import
- Expanded domain list to 60+ medical + CS domains with categorized tagging
- Cloud Run Job config: 10 parallel tasks, 100 segments per crawl
- Multi-crawl orchestrator for 14 quarterly snapshots (2020-2026)
- Enhanced generateTags with domain-specific labels for oncology, dermatology, ML conferences, research labs, and academic institutions
- Target: 375K-500K medical/CS pages over 5 months

Co-Authored-By: claude-flow <ruv@ruv.net>
1 parent 14ab7b0 commit 045c4c5

3 files changed: 196 additions & 4 deletions


scripts/deploy-wet-job.sh

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
#!/bin/bash
# Deploy WET processor as Cloud Run Job for large-scale Common Crawl import
# Usage: ./deploy-wet-job.sh [PROJECT] [CRAWL_INDEX] [START_SEGMENT] [NUM_SEGMENTS]
set -euo pipefail

PROJECT="${1:-ruv-dev}"
CRAWL_INDEX="${2:-CC-MAIN-2026-08}"
START_SEG="${3:-0}"
NUM_SEGS="${4:-100}"
REGION="us-central1"
JOB_NAME="wet-import-$(echo $CRAWL_INDEX | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"

echo "=== WET Cloud Run Job Deployment ==="
echo "Project: $PROJECT"
echo "Crawl: $CRAWL_INDEX"
echo "Segments: $START_SEG to $((START_SEG + NUM_SEGS - 1))"
echo "Job name: $JOB_NAME"
echo ""

# First, upload the filter script to GCS so the job can access it
echo "--- Uploading filter script to GCS ---"
gsutil cp scripts/wet-filter-inject.js gs://ruvector-brain-dev/scripts/wet-filter-inject.js 2>&1

# Get the WET paths file
echo "--- Fetching WET paths ---"
PATHS_URL="https://data.commoncrawl.org/crawl-data/${CRAWL_INDEX}/wet.paths.gz"
curl -sL "$PATHS_URL" | gunzip | sed -n "$((START_SEG + 1)),$((START_SEG + NUM_SEGS))p" > /tmp/wet-paths-batch.txt
ACTUAL_SEGS=$(wc -l < /tmp/wet-paths-batch.txt)
echo "Segments to process: $ACTUAL_SEGS"

# Upload paths file
gsutil cp /tmp/wet-paths-batch.txt gs://ruvector-brain-dev/scripts/wet-paths-batch.txt 2>&1

# Build the domain list for the job command
DOMAIN_LIST="pubmed.ncbi.nlm.nih.gov,ncbi.nlm.nih.gov,who.int,cancer.org,aad.org,dermnetnz.org,melanoma.org,arxiv.org,acm.org,ieee.org,nature.com,nejm.org,bmj.com,mayoclinic.org,clevelandclinic.org,medlineplus.gov,cdc.gov,nih.gov,thelancet.com,sciencedirect.com,webmd.com,healthline.com,medscape.com,jamanetwork.com,frontiersin.org,plos.org,biomedcentral.com,cell.com,springer.com,cochrane.org,clinicaltrials.gov,fda.gov,mskcc.org,mdanderson.org,nccn.org,dl.acm.org,ieeexplore.ieee.org,proceedings.neurips.cc,huggingface.co,pytorch.org,tensorflow.org,cs.stanford.edu,deepmind.google,research.google,microsoft.com/research,openreview.net,paperswithcode.com,asco.org,esmo.org,dana-farber.org,cancer.net,uptodate.com,wiley.com,elsevier.com,mdpi.com,plos.org,aaai.org,usenix.org,jmlr.org,aclanthology.org"

# Create/update the Cloud Run Job
echo "--- Creating Cloud Run Job ---"
gcloud run jobs create "$JOB_NAME" \
  --project="$PROJECT" \
  --region="$REGION" \
  --image="node:20-alpine" \
  --command="/bin/sh" \
  --args="-c,apk add --no-cache curl bash > /dev/null 2>&1 && gsutil cp gs://ruvector-brain-dev/scripts/wet-filter-inject.js /tmp/filter.js 2>/dev/null && WET_PATH=\$(gsutil cat gs://ruvector-brain-dev/scripts/wet-paths-batch.txt 2>/dev/null | sed -n \"\$(( \${CLOUD_RUN_TASK_INDEX:-0} + 1 ))p\" | head -1) && echo \"Processing: \$WET_PATH\" && curl -sL \"https://data.commoncrawl.org/\$WET_PATH\" | gunzip | node /tmp/filter.js --brain-url https://pi.ruv.io --auth 'Authorization: Bearer ruvector-crawl-2026' --batch-size 10 --crawl-index $CRAWL_INDEX --domains '$DOMAIN_LIST'" \
  --task-count="$ACTUAL_SEGS" \
  --parallelism=10 \
  --max-retries=1 \
  --cpu=1 \
  --memory=1Gi \
  --task-timeout=3600s \
  --set-env-vars="CRAWL_INDEX=$CRAWL_INDEX" \
  2>&1 || \
gcloud run jobs update "$JOB_NAME" \
  --project="$PROJECT" \
  --region="$REGION" \
  --task-count="$ACTUAL_SEGS" \
  --parallelism=10 \
  2>&1

echo ""
echo "--- Job created. To execute: ---"
echo "gcloud run jobs execute $JOB_NAME --project=$PROJECT --region=$REGION"
echo ""
echo "--- To monitor: ---"
echo "gcloud run jobs executions list --job=$JOB_NAME --project=$PROJECT --region=$REGION"

scripts/wet-filter-inject.js

Lines changed: 71 additions & 4 deletions
@@ -20,8 +20,51 @@ const MAX_CONTENT_LENGTH = 8000;
 const stats = { total: 0, filtered: 0, injected: 0, errors: 0, batched: 0 };
 let batch = [];
 
+// Default domain list: 60+ medical + CS domains
+const DEFAULT_DOMAINS = [
+  // Medical - Major Publishers & Journals
+  'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov', 'who.int',
+  'nature.com', 'nejm.org', 'bmj.com', 'thelancet.com',
+  'jamanetwork.com', 'annals.org', 'sciencedirect.com',
+  // Medical - Clinical Resources
+  'mayoclinic.org', 'clevelandclinic.org', 'medlineplus.gov',
+  'cdc.gov', 'nih.gov', 'webmd.com', 'healthline.com',
+  'medscape.com', 'uptodate.com',
+  // Medical - Oncology & Dermatology
+  'cancer.org', 'aad.org', 'dermnetnz.org', 'melanoma.org',
+  'asco.org', 'esmo.org', 'nccn.org', 'cancer.net',
+  'mskcc.org', 'mdanderson.org', 'dana-farber.org',
+  'dermcoll.edu.au', 'bad.org.uk', 'euroderm.org',
+  'jaad.org', 'jidonline.org',
+  // Medical - Publishers & Open Access
+  'wiley.com', 'onlinelibrary.wiley.com', 'springer.com',
+  'karger.com', 'thieme.com', 'mdpi.com', 'frontiersin.org',
+  'plos.org', 'biomedcentral.com', 'cell.com', 'elsevier.com',
+  // Medical - Regulatory & Evidence
+  'clinicaltrials.gov', 'fda.gov', 'ema.europa.eu',
+  'nice.org.uk', 'cochrane.org',
+  'hopkinsmedicine.org', 'stanfordmedicine.org',
+  // CS - Conferences & Journals
+  'arxiv.org', 'acm.org', 'dl.acm.org', 'ieee.org',
+  'ieeexplore.ieee.org', 'proceedings.neurips.cc',
+  'aclanthology.org', 'jmlr.org', 'aaai.org', 'ijcai.org',
+  'usenix.org', 'vldb.org', 'sigmod.org', 'icml.cc',
+  'cvpr.thecvf.com', 'eccv.ecva.net', 'iccv.thecvf.com',
+  'openreview.net', 'paperswithcode.com',
+  // CS - Frameworks & Tools
+  'huggingface.co', 'pytorch.org', 'tensorflow.org',
+  'wandb.ai', 'mlflow.org', 'ray.io',
+  'dmlc.cs.washington.edu',
+  // CS - Research Labs & Universities
+  'cs.stanford.edu', 'cs.berkeley.edu', 'cs.cmu.edu',
+  'cs.mit.edu', 'deepmind.google', 'ai.meta.com',
+  'research.google', 'microsoft.com/research',
+  'blog.openai.com', 'anthropic.com',
+];
+
 function matchesDomain(url) {
-  return DOMAINS.some(d => url.includes(d));
+  const allDomains = DOMAINS.length > 0 ? DOMAINS : DEFAULT_DOMAINS;
+  return allDomains.some(d => url.includes(d));
 }
 
 function extractTitle(content) {
@@ -38,12 +81,36 @@ function generateTags(url, content) {
   if (url.includes('pubmed') || url.includes('ncbi')) tags.push('pubmed', 'medical');
   else if (url.includes('arxiv')) tags.push('arxiv', 'research');
   else if (url.includes('who.int')) tags.push('who', 'global-health');
-  else if (url.includes('cancer.org')) tags.push('cancer', 'oncology');
-  else if (url.includes('dermnetnz') || url.includes('aad.org')) tags.push('dermatology');
+  else if (url.includes('cancer.org') || url.includes('cancer.net') || url.includes('nccn.org')) tags.push('cancer', 'oncology');
+  else if (url.includes('asco.org') || url.includes('esmo.org')) tags.push('oncology', 'clinical');
+  else if (url.includes('mskcc.org') || url.includes('mdanderson.org') || url.includes('dana-farber.org')) tags.push('oncology', 'research');
+  else if (url.includes('dermnetnz') || url.includes('aad.org') || url.includes('jaad.org')) tags.push('dermatology');
+  else if (url.includes('dermcoll') || url.includes('bad.org.uk') || url.includes('euroderm')) tags.push('dermatology');
+  else if (url.includes('jidonline')) tags.push('dermatology', 'research');
   else if (url.includes('melanoma')) tags.push('melanoma', 'skin-cancer');
-  else if (url.includes('acm.org') || url.includes('ieee')) tags.push('computer-science');
+  else if (url.includes('clinicaltrials.gov')) tags.push('clinical-trials', 'medical');
+  else if (url.includes('fda.gov') || url.includes('ema.europa.eu')) tags.push('regulatory', 'medical');
+  else if (url.includes('nice.org.uk') || url.includes('cochrane.org')) tags.push('evidence-based', 'medical');
+  else if (url.includes('hopkinsmedicine') || url.includes('stanfordmedicine')) tags.push('medical', 'academic');
+  else if (url.includes('webmd') || url.includes('healthline') || url.includes('medscape')) tags.push('medical', 'clinical');
+  else if (url.includes('uptodate.com')) tags.push('medical', 'clinical-decision');
+  else if (url.includes('acm.org') || url.includes('ieee') || url.includes('dl.acm.org')) tags.push('computer-science');
+  else if (url.includes('neurips') || url.includes('icml') || url.includes('aaai.org')) tags.push('ml', 'conference');
+  else if (url.includes('cvpr') || url.includes('eccv') || url.includes('iccv')) tags.push('computer-vision', 'conference');
+  else if (url.includes('aclanthology')) tags.push('nlp', 'conference');
+  else if (url.includes('usenix') || url.includes('vldb') || url.includes('sigmod')) tags.push('systems', 'conference');
+  else if (url.includes('huggingface') || url.includes('pytorch') || url.includes('tensorflow')) tags.push('ml', 'framework');
+  else if (url.includes('deepmind') || url.includes('ai.meta') || url.includes('research.google')) tags.push('ml', 'research-lab');
+  else if (url.includes('openai') || url.includes('anthropic')) tags.push('ml', 'research-lab');
+  else if (url.includes('cs.stanford') || url.includes('cs.berkeley') || url.includes('cs.cmu') || url.includes('cs.mit')) tags.push('computer-science', 'academic');
+  else if (url.includes('openreview') || url.includes('paperswithcode')) tags.push('ml', 'research');
   else if (url.includes('github') || url.includes('stackoverflow')) tags.push('programming');
   else if (url.includes('nature.com') || url.includes('nejm') || url.includes('lancet')) tags.push('journal', 'research');
+  else if (url.includes('jamanetwork') || url.includes('annals.org') || url.includes('bmj.com')) tags.push('journal', 'medical');
+  else if (url.includes('frontiersin') || url.includes('plos.org') || url.includes('biomedcentral')) tags.push('open-access', 'research');
+  else if (url.includes('cell.com') || url.includes('elsevier') || url.includes('springer') || url.includes('wiley')) tags.push('journal', 'publisher');
+  else if (url.includes('mdpi.com') || url.includes('karger') || url.includes('thieme')) tags.push('journal', 'publisher');
+  else if (url.includes('jmlr.org') || url.includes('ijcai.org')) tags.push('ml', 'journal');
 
   const lower = content.toLowerCase();
   if (lower.includes('melanoma')) tags.push('melanoma');
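
Because the diff only changes the default domain list and the tag rules, the filter can be sanity-checked locally by streaming one WET file through it with a narrow --domains value, mirroring the pipeline the Cloud Run task runs. A sketch (the WET path is a placeholder; substitute a real line from the crawl's wet.paths.gz):

#!/bin/bash
# Local smoke test for the filter (sketch); flags mirror the ones used in deploy-wet-job.sh.
WET_PATH="crawl-data/CC-MAIN-2026-08/segments/<segment>/wet/<file>.warc.wet.gz"  # placeholder

curl -sL "https://data.commoncrawl.org/$WET_PATH" \
  | gunzip \
  | node scripts/wet-filter-inject.js \
      --brain-url https://pi.ruv.io \
      --auth 'Authorization: Bearer ruvector-crawl-2026' \
      --batch-size 10 \
      --crawl-index CC-MAIN-2026-08 \
      --domains 'dermnetnz.org,melanoma.org,arxiv.org'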

scripts/wet-full-import.sh

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
#!/bin/bash
# Full 6-year medical + CS import via WET processing
# Processes quarterly Common Crawl snapshots from 2020-2026
set -euo pipefail

PROJECT="${1:-ruv-dev}"
SEGS_PER_CRAWL="${2:-100}"  # segments per crawl to process

# Quarterly crawl indices (2020-2026)
CRAWLS=(
  "CC-MAIN-2020-16"
  "CC-MAIN-2020-50"
  "CC-MAIN-2021-17"
  "CC-MAIN-2021-43"
  "CC-MAIN-2022-05"
  "CC-MAIN-2022-33"
  "CC-MAIN-2023-06"
  "CC-MAIN-2023-40"
  "CC-MAIN-2024-10"
  "CC-MAIN-2024-42"
  "CC-MAIN-2025-13"
  "CC-MAIN-2025-40"
  "CC-MAIN-2026-06"
  "CC-MAIN-2026-08"
)

BRAIN_URL="https://pi.ruv.io"

echo "=== Full 6-Year Medical + CS Import ==="
echo "Crawls: ${#CRAWLS[@]}"
echo "Segments per crawl: $SEGS_PER_CRAWL"
echo "Total segments: $((${#CRAWLS[@]} * SEGS_PER_CRAWL))"
echo ""

BEFORE=$(curl -s "$BRAIN_URL/v1/status" \
  | python3 -c "import sys,json; print(json.load(sys.stdin)['total_memories'])" 2>/dev/null || echo "0")
echo "Brain memories before: $BEFORE"
echo ""

for crawl in "${CRAWLS[@]}"; do
  echo "=== Deploying job for $crawl ==="
  bash scripts/deploy-wet-job.sh "$PROJECT" "$crawl" 0 "$SEGS_PER_CRAWL"

  # Execute the job
  JOB_NAME="wet-import-$(echo $crawl | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"
  gcloud run jobs execute "$JOB_NAME" --project="$PROJECT" --region=us-central1 --async 2>&1

  echo "Job $JOB_NAME submitted (async)"
  echo ""

  # Don't flood -- wait 30s between job submissions
  sleep 30
done

echo ""
echo "=== All jobs submitted ==="
echo "Monitor with: gcloud run jobs executions list --project=$PROJECT --region=us-central1"
echo ""
echo "Check brain growth:"
echo "  curl -s $BRAIN_URL/v1/status | python3 -c \"import sys,json; d=json.load(sys.stdin); print(f'Memories: {d[\\\"total_memories\\\"]}')\""
