Skip to content

Commit 3c04be3

Browse files
authored
Update DuckDB queries and parquet file loading (#24)
* fix: load the parquet files recursively and use the query of duckdb to select the crawl and the subset, as was done in the Java Tour * chore(docs): update documentation with instruction on how to download the crawl data with and without the AWS CLI * fix: parametrize the crawl name * fix: remove scripts and AWS in favour of cc-downloader * fix: refer to the cc-downloader repo in case cargo is not available * docs: more details on cc-downloader * feat: update index_download_advice to recommend cc-downloader and check local files * fix: trailing slash
1 parent 08dcfbe commit 3c04be3

File tree

3 files changed

+75
-22
lines changed

3 files changed

+75
-22
lines changed

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,14 @@ CC-MAIN-2024-22.warc.paths.gz:
6262
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
6363

6464
duck_local_files:
65+
ifndef LOCAL_DIR
66+
$(error LOCAL_DIR is required. Usage: make duck_local_files LOCAL_DIR=/path/to/data)
67+
endif
6568
@echo "warning! 300 gigabyte download"
66-
python duck.py local_files
69+
python duck.py local_files $(LOCAL_DIR)
6770

6871
duck_ccf_local_files:
69-
@echo "warning! only works on Common Crawl Foundadtion's development machine"
72+
@echo "warning! only works on Common Crawl Foundation's development machine"
7073
python duck.py ccf_local_files
7174

7275
duck_cloudfront:

README.md

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -546,9 +546,46 @@ The program then writes that one record into a local Parquet file, does a second
546546

547547
### Bonus: download a full crawl index and query with DuckDB
548548

549-
If you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly. Run
549+
In case you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly.
550550

551-
```make duck_local_files```
551+
> [!IMPORTANT]
552+
> If you happen to be using the Common Crawl Foundation development server, we've already downloaded these files, and you can run ```make duck_ccf_local_files```
553+
554+
To download the crawl index, please use [cc-downloader](https://github.com/commoncrawl/cc-downloader), which is the official and recommended downloader for Common Crawl data.
555+
556+
The simplest way to install `cc-downloader` is through cargo, the Rust package manager. If you have Rust installed, you can run:
557+
558+
```shell
559+
cargo install cc-downloader
560+
```
561+
562+
> [!WARNING]
563+
> `cc-downloader` may not be on your `PATH` by default; you can still run it by using its full path (e.g. `~/.cargo/bin/cc-downloader`).
564+
565+
If cargo is not available or the installation fails, you can download prebuilt binaries instead; see [the cc-downloader official repository](https://github.com/commoncrawl/cc-downloader).
566+
567+
```shell
568+
mkdir crawl
569+
~/.cargo/bin/cc-downloader download-paths CC-MAIN-2024-22 cc-index-table crawl
570+
~/.cargo/bin/cc-downloader download crawl/cc-index-table.paths.gz --progress crawl
571+
```
572+
573+
Either way, the resulting file structure should look something like this:
574+
```shell
575+
tree crawl/
576+
crawl/
577+
├── cc-index
578+
│ └── table
579+
│ └── cc-main
580+
│ └── warc
581+
│ └── crawl=CC-MAIN-2024-22
582+
│ └── subset=warc
583+
│ ├── part-00000-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
584+
│ ├── part-00000-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c001.gz.parquet
585+
```
586+
587+
588+
Then, you can run `make duck_local_files LOCAL_DIR=/path/to/the/downloaded/data` to run the same query as above, but this time using your local copy of the index files.
552589

553590
If the files aren't already downloaded, this command will give you
554591
download instructions.

duck.py

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
from pathlib import Path
2+
13
import time
2-
import glob
34
import json
45
import os.path
56
import sys
@@ -10,11 +11,13 @@
1011
import duckdb
1112

1213

13-
def index_download_advice(prefix, crawl):
14+
def index_download_advice(local_prefix, crawl):
1415
print('Do you need to download this index?')
15-
print(f' mkdir -p {prefix}/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/')
16-
print(f' cd {prefix}/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/')
17-
print(f' aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/ .')
16+
print('The recommended way is to use cc-downloader https://github.com/commoncrawl/cc-downloader')
17+
print('If you have cargo and Rust already installed: `cargo install cc-downloader` '
18+
'(alternatively, the binaries are available on the GitHub repository), and then, ')
19+
print(f' ~/.cargo/bin/cc-downloader download-paths {crawl} cc-index-table {local_prefix}')
20+
print(f' ~/.cargo/bin/cc-downloader download {local_prefix}/cc-index-table.paths.gz --progress {local_prefix}')
1821

1922

2023
def print_row_as_cdxj(row):
@@ -48,20 +51,22 @@ def print_row_as_kv_list(row):
4851
all_algos = ('s3_glob', 'local_files', 'ccf_local_files', 'cloudfront_glob', 'cloudfront')
4952

5053

51-
def get_files(algo, crawl):
54+
def get_files(algo, crawl, local_prefix=None):
5255
if algo == 's3_glob':
5356
# 403 errors with and without credentials. you have to be commoncrawl-pds
5457
files = f's3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet'
5558
raise NotImplementedError('will cause a 403')
5659
elif algo == 'local_files':
57-
files = os.path.expanduser(f'~/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
58-
files = glob.glob(files)
59-
# did we already download? we expect 300 files of about a gigabyte
60+
files = [str(f) for f in Path(os.path.expanduser(f'{local_prefix}')).rglob('*.parquet')]
61+
# Check whether the local files have been already downloaded.
62+
# We expect 300 files of about a gigabyte
6063
if len(files) < 250:
61-
index_download_advice('~', crawl)
64+
index_download_advice(local_prefix, crawl)
6265
exit(1)
6366
elif algo == 'ccf_local_files':
64-
files = glob.glob(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
67+
files = [str(f) for f in Path(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc').rglob('*.parquet')]
68+
# Check whether the local files have been already downloaded
69+
# We expect 300 files of about a gigabyte
6570
if len(files) < 250:
6671
index_download_advice('/home/cc-pds', crawl)
6772
exit(1)
@@ -81,12 +86,12 @@ def get_files(algo, crawl):
8186
return files
8287

8388

84-
def main(algo, crawl):
89+
def main(algo, crawl, local_prefix=None):
8590
windows = True if platform.system() == 'Windows' else False
8691
if windows:
8792
# windows stdout is often cp1252
8893
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
89-
files = get_files(algo, crawl)
94+
files = get_files(algo, crawl, local_prefix)
9095
retries_left = 100
9196

9297
while True:
@@ -112,7 +117,7 @@ def main(algo, crawl):
112117
retries_left = 100
113118
while True:
114119
try:
115-
print(duckdb.sql('SELECT COUNT(*) FROM ccindex;'))
120+
print(duckdb.sql(f"SELECT COUNT(*) FROM ccindex WHERE subset = 'warc' and crawl = '{crawl}';"))
116121
break
117122
except duckdb.InvalidInputException as e:
118123
# duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...'
@@ -124,17 +129,17 @@ def main(algo, crawl):
124129
else:
125130
raise
126131

127-
sq2 = f'''
132+
sq2 = f"""
128133
select
129134
*
130135
from ccindex
131136
where subset = 'warc'
132-
and crawl = 'CC-MAIN-2024-22'
137+
and crawl = '{crawl}'
133138
and url_host_tld = 'org' -- help the query optimizer
134139
and url_host_registered_domain = 'wikipedia.org' -- ditto
135140
and url = 'https://an.wikipedia.org/wiki/Escopete'
136141
;
137-
'''
142+
"""
138143

139144
row2 = duckdb.sql(sq2)
140145
print('our one row')
@@ -175,13 +180,21 @@ def main(algo, crawl):
175180

176181
if __name__ == '__main__':
177182
crawl = 'CC-MAIN-2024-22'
183+
local_prefix = None
178184
if len(sys.argv) > 1:
179185
algo = sys.argv[1]
180186
if algo == 'help':
181187
print('possible algos:', all_algos)
182188
exit(1)
189+
elif algo == 'local_files':
190+
if len(sys.argv) < 3:
191+
print('for local_files algo, you must provide a local prefix as the second argument')
192+
exit(1)
193+
else:
194+
local_prefix = sys.argv[2]
195+
print(f"Using local prefix {local_prefix}")
183196
else:
184197
algo = 'cloudfront'
185198
print('using algo: ', algo)
186199

187-
main(algo, crawl)
200+
main(algo, crawl, local_prefix)

0 commit comments

Comments
 (0)