# Remote example archives used by the *-remote targets.

# The whirlwind example WARC, served over HTTPS from raw.githubusercontent.com.
GITHUB_WHIRLWIND_WARC_HTTPS = https://raw.githubusercontent.com/commoncrawl/whirlwind-python/refs/heads/main/whirlwind.warc.gz

# Object keys inside the "eotarchive" bucket. Each archive is addressed twice
# below, once through the bucket's HTTPS endpoint and once as an s3:// URL,
# so the key is written only once.
eot_ia_warc_key = crawl-data/EOT-2024/segments/IA-000/warc/EOT24PRE-20240926172119-crawl804_EOT24PRE-20240926172119-00000.warc.gz
eot_cc_warc_key = crawl-data/EOT-2024/segments/CC-000/warc/EOT-2024-REPACKAGE-CC-MAIN-2024-42-GOV-000000-001.warc.gz

# End-of-Term 2024, Internet Archive segment.
EOT_IA_WARC_HTTPS = https://eotarchive.s3.amazonaws.com/$(eot_ia_warc_key)
EOT_IA_WARC_S3 = s3://eotarchive/$(eot_ia_warc_key)

# End-of-Term 2024, Common Crawl segment.
EOT_CC_WARC_HTTPS = https://eotarchive.s3.amazonaws.com/$(eot_cc_warc_key)
EOT_CC_WARC_S3 = s3://eotarchive/$(eot_cc_warc_key)
6+
7+
# venv: announce the virtualenv location and make sure its parent directory,
# ~/venv, exists.
# The path must be the single word "~/venv": written as "~ /venv" the shell
# sees two arguments and mkdir would try to create /venv at the filesystem root.
venv:
	@echo " making a venv in ~/venv/whirlwind"
	mkdir -p ~/venv
@@ -22,46 +29,37 @@ iterate:
2229 python ./warcio-iterator.py whirlwind.warc.wat.gz
2330 @echo
2431
# FIXME: Update s3 locations if moved to public bucket.
# iterate-remote-s3: run warcio-iterator.py against the WARC, WET and WAT
# example files stored under s3://commoncrawl-dev/whirlwind-example-files/.
# The banner names the transport actually used (s3), matching the URLs below.
iterate-remote-s3:
	@echo iterating over remote warcs over s3:
	@echo
	@echo warc:
	python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.gz
	@echo
	@echo wet:
	python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wet.gz
	@echo
	@echo wat:
	python ./warcio-iterator.py s3://commoncrawl-dev/whirlwind-example-files/whirlwind.warc.wat.gz
37-
38-
# FIXME: We need the example files on a public S3 bucket for this:
# iterate-remote-https:
#	@echo iterating over remote warcs over https:
#	@echo
#	@echo warc:
#	python ./warcio-iterator.py https://data.commoncrawl.org/<HYPOTHETICAL-PREFIX>/whirlwind.warc.gz
#	@echo
#	@echo wet:
#	python ./warcio-iterator.py https://data.commoncrawl.org/<HYPOTHETICAL-PREFIX>/whirlwind.warc.wet.gz
#	@echo
#	@echo wat:
#	python ./warcio-iterator.py https://data.commoncrawl.org/<HYPOTHETICAL-PREFIX>/whirlwind.warc.wat.gz
# iterate-remote: run warcio-iterator.py against the whirlwind WARC fetched
# over HTTPS (URL taken from GITHUB_WHIRLWIND_WARC_HTTPS).
# The variable reference must not contain whitespace before the closing
# parenthesis: make does not trim it, so the name with a trailing space is a
# different, undefined variable and the script would be run with no argument.
iterate-remote:
	@echo " iterating over the remote WARC file over HTTPS"
	python ./warcio-iterator.py $(GITHUB_WHIRLWIND_WARC_HTTPS)
5135
# cdxj: build one .cdxj index next to each local example archive.
#   whirlwind.warc.gz     -> whirlwind.warc.cdxj
#   whirlwind.warc.wet.gz -> whirlwind.warc.wet.cdxj  (indexed with --records conversion)
#   whirlwind.warc.wat.gz -> whirlwind.warc.wat.cdxj
cdxj:
	@echo " creating *.cdxj index files from the local warcs"
	cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
	cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
	cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
5741
# cdxj-remote: index two remote End-of-Term 2024 archives without storing them
# locally -- the Internet Archive file via EOT_IA_WARC_HTTPS and the Common
# Crawl file via EOT_CC_WARC_S3.
# For each archive the indexer's stderr is discarded and only the first 10
# lines of index output are printed (the banners state the files are ~1GB).
# The variable references must not contain whitespace before the closing
# parenthesis, otherwise make expands an undefined name to an empty string.
cdxj-remote:
	@echo " indexing End-of-Term 2024 Internet Archive WARC file over HTTPS (Filesize ~1GB, first 10 records are shown):"
	cdxj-indexer $(EOT_IA_WARC_HTTPS) 2> /dev/null | head -n 10
	@echo
	@echo " indexing End-of-Term 2024 Common Crawl WARC file over S3 (Filesize ~1GB, first 10 records are shown):"
	cdxj-indexer $(EOT_CC_WARC_S3) 2> /dev/null | head -n 10
48+
# extract: write the payload of one record from each local archive to a file.
#   whirlwind.warc.gz     @ offset 1023 -> extraction.html
#   whirlwind.warc.wet.gz @ offset 466  -> extraction.txt
#   whirlwind.warc.wat.gz @ offset 443  -> extraction.json
# Per the banner below, the offsets are taken from the cdxj index.
extract:
	@echo " creating extraction.* from local warcs, the offset numbers are from the cdxj index"
	warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
	warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
	warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
	@echo " hint: python -m json.tool extraction.json"
6455
# extract-remote: print one record from each remote End-of-Term 2024 archive
# to stdout, addressed by byte offset (per the banners, the offsets come from
# the cdxj index):
#   EOT_IA_WARC_HTTPS @ 50755  (hpxml.nrel.gov record, over HTTPS)
#   EOT_CC_WARC_S3    @ 18595  (before-you-ship.18f.gov record, over S3)
# The variable references must not contain whitespace before the closing
# parenthesis, otherwise make expands an undefined name to an empty string and
# warcio receives the offset where the archive location is expected.
extract-remote:
	@echo " extracting hpxml.nrel.gov record from End-of-Term 2024 Internet Archive warc over HTTPS (offset 50755 from cdxj index):"
	warcio extract $(EOT_IA_WARC_HTTPS) 50755
	@echo
	@echo " extracting before-you-ship.18f.gov record from End-of-Term 2024 Common Crawl warc over S3 (offset 18595 from cdxj index):"
	warcio extract $(EOT_CC_WARC_S3) 18595
62+
6563cdx_toolkit :
6664 @echo demonstrate that we have this entry in the index
6765 cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
0 commit comments