Skip to content

Commit ade1ee0

Browse files
committed
fix: WET processor OOM — process records inline, increase memory to 2Gi
Node.js heap exhausted at 512MB buffering 21K WARC records. Fix: process each record immediately instead of accumulating in pendingRecords array. Also cap per-record content length and increase Cloud Run Job memory from 1Gi to 2Gi with --max-old-space-size=1536. Co-Authored-By: claude-flow <ruv@ruv.net>
1 parent 11c6cdf commit ade1ee0

2 files changed

Lines changed: 16 additions & 11 deletions

File tree

scripts/deploy-wet-job.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ echo "Crawl index: $CRAWL_INDEX"
6262
6363
curl -sL "https://data.commoncrawl.org/$WET_PATH" \
6464
| gunzip \
65-
| node /app/filter.js \
65+
| node --max-old-space-size=1536 /app/filter.js \
6666
--brain-url "$BRAIN_URL" \
6767
--auth "$AUTH_HEADER" \
6868
--batch-size "$BATCH_SIZE" \
@@ -92,7 +92,7 @@ gcloud run jobs deploy "$JOB_NAME" \
9292
--parallelism=10 \
9393
--max-retries=1 \
9494
--cpu=1 \
95-
--memory=1Gi \
95+
--memory=2Gi \
9696
--task-timeout=3600s \
9797
--env-vars-file="$BUILD_DIR/env.yaml" \
9898
2>&1

scripts/wet-filter-inject.js

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,16 @@ const rl = readline.createInterface({ input: process.stdin, crlfDelay: Infinity
187187
let recordUrl = '';
188188
let recordContent = '';
189189
let inRecord = false;
190-
const pendingRecords = [];
190+
// Process records inline to avoid OOM — never buffer all records
191+
let processQueue = Promise.resolve();
191192

192193
rl.on('line', (line) => {
193194
if (line.startsWith('WARC/1.0')) {
194195
if (recordUrl && recordContent) {
195-
pendingRecords.push({ url: recordUrl, content: recordContent });
196+
// Process immediately, don't accumulate
197+
const url = recordUrl;
198+
const content = recordContent;
199+
processQueue = processQueue.then(() => processRecord(url, content));
196200
}
197201
recordUrl = '';
198202
recordContent = '';
@@ -202,19 +206,20 @@ rl.on('line', (line) => {
202206
} else if (line.startsWith('Content-Length:')) {
203207
inRecord = true;
204208
} else if (inRecord) {
205-
recordContent += line + '\n';
209+
// Limit content accumulation per record to prevent single-record bloat
210+
if (recordContent.length < MAX_CONTENT_LENGTH * 2) {
211+
recordContent += line + '\n';
212+
}
206213
}
207214
});
208215

209216
rl.on('close', async () => {
210217
// Process last record
211218
if (recordUrl && recordContent) {
212-
pendingRecords.push({ url: recordUrl, content: recordContent });
213-
}
214-
215-
// Process all records sequentially
216-
for (const rec of pendingRecords) {
217-
await processRecord(rec.url, rec.content);
219+
await processQueue;
220+
await processRecord(recordUrl, recordContent);
221+
} else {
222+
await processQueue;
218223
}
219224

220225
// Flush remaining batch

0 commit comments

Comments
 (0)