Skip to content

Commit ac6d444

Browse files
fix(Makefile): add targets iterate-remote, cdxj-remote, extract-remote; use the GitHub-hosted remote WARC over HTTPS and the EOT WARCs over HTTPS and S3.
1 parent 7d99aee commit ac6d444

1 file changed

Lines changed: 24 additions & 26 deletions

File tree

Makefile

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# Locations of the remote WARC files used by the *-remote targets.
#   GITHUB_WHIRLWIND_WARC_HTTPS  whirlwind.warc.gz served from raw.githubusercontent.com (HTTPS)
#   EOT_IA_WARC_HTTPS / _S3      the same EOT-2024 IA-000 segment WARC, as an HTTPS URL and as an s3:// URL
#   EOT_CC_WARC_HTTPS / _S3      the same EOT-2024 CC-000 segment WARC, as an HTTPS URL and as an s3:// URL
# NOTE(review): in the part of the Makefile visible in this diff, only
# GITHUB_WHIRLWIND_WARC_HTTPS, EOT_IA_WARC_HTTPS and EOT_CC_WARC_S3 are referenced
# by a recipe; EOT_IA_WARC_S3 and EOT_CC_WARC_HTTPS appear unused - confirm whether
# they are kept deliberately as alternatives.
GITHUB_WHIRLWIND_WARC_HTTPS = https://raw.githubusercontent.com/commoncrawl/whirlwind-python/refs/heads/main/whirlwind.warc.gz
2+
EOT_IA_WARC_HTTPS = https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz
3+
EOT_IA_WARC_S3 = s3://eotarchive/crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz
4+
EOT_CC_WARC_HTTPS = https://eotarchive.s3.amazonaws.com/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz
5+
EOT_CC_WARC_S3 = s3://eotarchive/crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz
6+
7+
18
venv:
29
@echo "making a venv in ~/venv/whirlwind"
310
mkdir -p ~/venv
@@ -22,46 +29,37 @@ iterate:
2229
python ./warcio-iterator.py whirlwind.warc.wat.gz
2330
@echo
2431

25-
#FIXME: Update s3 locations if moved to public bucket:
26-
iterate-remote-s3:
27-
@echo iterating over remote warcs over https:
28-
@echo
29-
@echo warc:
30-
python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.gz
31-
@echo
32-
@echo wet:
33-
python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wet.gz
34-
@echo
35-
@echo wat:
36-
python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wat.gz
37-
38-
39-
#FIXME: We need the example files on public s3 bucket for this:
40-
#iterate-remote-https:
41-
# @echo iterating over remote warcs over https:
42-
# @echo
43-
# @echo warc:
44-
# python ./warcio-iterator.py https://data.commoncrawl.org/<HYPOTHETICAL-PREFIX>/whirlwind.warc.gz
45-
# @echo
46-
# @echo wet:
47-
# python ./warcio-iterator.py https://data.commoncrawl.org/<HYPOTHETICAL-PREFIX>/whirlwind.warc.wet.gz
48-
# @echo
49-
# @echo wat:
50-
# python ./warcio-iterator.py https://data.commoncrawl.org/<HYPOTHETICAL-PREFIX>/whirlwind.warc.wat.gz
32+
# iterate-remote: run ./warcio-iterator.py against a remote WARC instead of a local file.
# The script receives the HTTPS URL stored in GITHUB_WHIRLWIND_WARC_HTTPS as its only
# argument. Only the .warc.gz is iterated here; no remote WET or WAT file is passed.
iterate-remote:
33+
@echo "iterating over the remote WARC file over HTTPS"
34+
python ./warcio-iterator.py $(GITHUB_WHIRLWIND_WARC_HTTPS)
5135

5236
# cdxj: build a .cdxj index next to each of the three local whirlwind files.
# Each cdxj-indexer invocation writes its stdout to <input-without-.gz>.cdxj:
#   whirlwind.warc.gz      -> whirlwind.warc.cdxj
#   whirlwind.warc.wet.gz  -> whirlwind.warc.wet.cdxj  (indexed with `--records conversion`)
#   whirlwind.warc.wat.gz  -> whirlwind.warc.wat.cdxj
# NOTE(review): `--records conversion` is passed only for the WET file, presumably
# because its records are of WARC type "conversion" - confirm against cdxj-indexer docs.
cdxj:
5337
@echo "creating *.cdxj index files from the local warcs"
5438
cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
5539
cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
5640
cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
5741

42+
# cdxj-remote: index two remote End-of-Term 2024 WARC files and show the start of each index.
#   1. EOT_IA_WARC_HTTPS (HTTPS URL)
#   2. EOT_CC_WARC_S3    (s3:// URL)
# Nothing is written to disk: each index is piped through `head -n 10`, so only the
# first 10 output lines are printed.
# stderr of cdxj-indexer is discarded (2>/dev/null).
# NOTE(review): discarding stderr presumably hides the noise produced when `head`
# closes the pipe early, but it also hides genuine errors (e.g. a failed download) -
# confirm this trade-off is intended.
cdxj-remote:
43+
@echo "indexing End-of-Term 2024 Internet Archive WARC file over HTTPS (Filesize ~1GB, first 10 records are shown):"
44+
cdxj-indexer $(EOT_IA_WARC_HTTPS) 2>/dev/null | head -n 10
45+
@echo
46+
@echo "indexing End-of-Term 2024 Common Crawl WARC file over S3 (Filesize ~1GB, first 10 records are shown):"
47+
cdxj-indexer $(EOT_CC_WARC_S3) 2>/dev/null | head -n 10
48+
5849
# extract: pull one record out of each local whirlwind file with `warcio extract --payload`.
# The numeric argument is the record offset; per the echo message below, the offsets
# come from the cdxj index. Output of each command is redirected to a file:
#   whirlwind.warc.gz      offset 1023 -> extraction.html
#   whirlwind.warc.wet.gz  offset 466  -> extraction.txt
#   whirlwind.warc.wat.gz  offset 443  -> extraction.json
# The offsets are hard-coded, so they must be updated if the local files are regenerated.
extract:
5950
@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
6051
warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
6152
warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
6253
warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
6354
@echo "hint: python -m json.tool extraction.json"
6455

56+
# extract-remote: extract one record from each of two remote End-of-Term 2024 WARC files.
#   1. EOT_IA_WARC_HTTPS at offset 50755 (described below as the hpxml.nrel.gov record)
#   2. EOT_CC_WARC_S3    at offset 18595 (described below as the before-you-ship.18f.gov record)
# Unlike the local `extract` target, `--payload` is not passed and the output is not
# redirected, so whatever `warcio extract` emits goes straight to stdout.
# The offsets are hard-coded; per the echo messages they come from the cdxj index.
extract-remote:
57+
@echo "extracting hpxml.nrel.gov record from End-of-Term 2024 Internet Archive warc over HTTPS (offset 50755 from cdxj index):"
58+
warcio extract $(EOT_IA_WARC_HTTPS) 50755
59+
@echo
60+
@echo "extracting before-you-ship.18f.gov record from End-of-Term 2024 Common Crawl warc over S3 (offset 18595 from cdxj index):"
61+
warcio extract $(EOT_CC_WARC_S3) 18595
62+
6563
cdx_toolkit:
6664
@echo demonstrate that we have this entry in the index
6765
cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete

0 commit comments

Comments
 (0)