-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathretrieve_gutenberg.py
More file actions
1704 lines (1445 loc) · 69.1 KB
/
retrieve_gutenberg.py
File metadata and controls
1704 lines (1445 loc) · 69.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Retrieve and filter Project Gutenberg texts for training-data preparation.
This script downloads books from Project Gutenberg based on predefined
priority works and subject filters. It saves the corpus in JSONL format.
All works are filtered to ensure publication before 1969 (pre-moon landing)
to maintain temporal consistency with the Soviet utopia aesthetic.
The script is fully **idempotent** — re-running it will skip works that
already exist in the output files and append only new ones. Use --reset
to wipe previous data and start fresh.
Retrieval is via direct HTTP download from gutenberg.org, with subject-based
search done by scraping the Gutenberg website.
Environment Variables:
GUTENBERG_DATA: Base directory for output (default: output/gutenberg_corpus)
"""
import getpass
import grp
import json
import os
import pwd
import re
import requests
import subprocess
import sys
import tempfile
import time
from bs4 import BeautifulSoup
from urllib.parse import quote
try:
from tqdm import tqdm
except ImportError:
tqdm = None
# Temporal cutoff: All works must be published before 1969 (moon landing year).
# Used as the default ``max_year`` for GutenbergRetriever.__init__.
TEMPORAL_CUTOFF_YEAR = 1969
# ---------------------------------------------------------------------------
# Directory bootstrap
# ---------------------------------------------------------------------------
def _ensure_directory(path: str) -> None:
"""Create *path* (and parents) and verify the current user can write to it.
The function is fully idempotent:
- If the directory already exists and is writable, it is a no-op.
- If it exists but is owned by another user, we attempt a recursive
``chown`` to the current user. When that fails (no permission) we
print the exact ``sudo`` command the operator should run and exit.
- If it does not exist we create it; when that fails we again print
the ``sudo`` command and exit.
"""
uid = os.getuid()
user = getpass.getuser()
gid = os.getgid()
group = grp.getgrgid(gid).gr_name
# --- create if missing ------------------------------------------------
if not os.path.isdir(path):
try:
os.makedirs(path, exist_ok=True)
except PermissionError:
print(f"\nError: cannot create {path} — permission denied.")
print(f"Run the following command first, then retry:\n")
print(f" sudo mkdir -p {path} && sudo chown -R {user}:{group} {path}\n")
sys.exit(1)
# --- check / fix ownership --------------------------------------------
dir_stat = os.stat(path)
if dir_stat.st_uid != uid:
# Attempt to take ownership
try:
subprocess.check_call(
['chown', '-R', f'{user}:{group}', path],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
print(f"Aligned ownership of {path} to {user}:{group}")
except (subprocess.CalledProcessError, PermissionError):
print(f"\nError: {path} is owned by "
f"uid {dir_stat.st_uid} ({pwd.getpwuid(dir_stat.st_uid).pw_name}), "
f"not {user}.")
print(f"Run the following command first, then retry:\n")
print(f" sudo chown -R {user}:{group} {path}\n")
sys.exit(1)
# --- writability probe ------------------------------------------------
probe = os.path.join(path, '.write_test')
try:
with open(probe, 'w') as f:
f.write('ok')
os.unlink(probe)
except PermissionError:
print(f"\nError: {path} exists but is not writable by {user}.")
print(f"Run the following command first, then retry:\n")
print(f" sudo chown -R {user}:{group} {path} && sudo chmod -R u+rwX {path}\n")
sys.exit(1)
class GutenbergRetriever:
    """Retrieve and filter Project Gutenberg texts for training-data preparation."""

    # Target Gutenberg IDs for priority works, organised by category.
    # Keys are Gutenberg ebook IDs; values are the expected titles, which are
    # fuzzy-checked against the downloaded content (see _titles_match).
    PRIORITY_WORKS_BY_CATEGORY = {
        'Utopian/Dystopian': {
            624: "Looking Backward",  # Edward Bellamy
            6424: "A Modern Utopia",  # H.G. Wells
            3261: "News from Nowhere",  # William Morris
            1164: "The Iron Heel",  # Jack London — oligarchy, class struggle
            32: "Herland",  # Charlotte Perkins Gilman
            61963: "We",  # Yevgeny Zamyatin — THE Soviet dystopia
            12163: "The Sleeper Awakes",  # H.G. Wells — future society, class revolt
            1497: "The Republic",  # Plato — ideal society, philosopher-kings
            1998: "Thus Spake Zarathustra",  # Nietzsche — übermensch, will to power
        },
        'Russian Literature': {
            2554: "Crime and Punishment",  # Dostoevsky
            28054: "The Brothers Karamazov",  # Dostoevsky — free will, morality
            600: "Notes from the Underground",  # Dostoevsky — alienation, consciousness
            2638: "The Idiot",  # Dostoevsky
            8117: "The Possessed",  # Dostoevsky — revolutionaries, nihilism
            1399: "Anna Karenina",  # Tolstoy
            2600: "War and Peace",  # Tolstoy
            3783: "Mother",  # Maxim Gorky — revolutionary spirit
            47935: "Fathers and Sons",  # Turgenev — generational conflict, nihilism
            1081: "Dead Souls",  # Gogol
            7986: "Plays by Anton Chekhov, Second Series",  # Contains The Cherry Orchard, Three Sisters
            1756: "Uncle Vanya",  # Chekhov
            1754: "The Seagull",  # Chekhov
            2197: "The Gambler",  # Dostoevsky — obsession, fate
        },
        'Early Science Fiction': {
            83: "From the Earth to the Moon",  # Jules Verne (combined with Round the Moon)
            164: "20,000 Leagues Under the Sea",  # Jules Verne
            18857: "Journey to the Center of the Earth",  # Jules Verne (full text version)
            1268: "The Mysterious Island",  # Jules Verne — survival, self-sufficiency
            35: "The Time Machine",  # H.G. Wells — class divide, evolution
            36: "The War of the Worlds",  # H.G. Wells — alien invasion
            1013: "The First Men in the Moon",  # H.G. Wells — lunar colony
            5230: "The Invisible Man",  # H.G. Wells
            159: "The Island of Doctor Moreau",  # H.G. Wells — playing god
            62: "A Princess of Mars",  # Burroughs — Mars adventure
            64: "The Gods of Mars",  # Burroughs — Mars sequel
            72: "Thuvia, Maid of Mars",  # Burroughs — Mars series
            1153: "The Chessmen of Mars",  # Burroughs — Mars + chess (Jetan), thematic overlap
            139: "The Lost World",  # Doyle — isolated civilization
            59112: "R.U.R.",  # Čapek — robots, AI rebellion
            61213: "The 64-Square Madhouse",  # Fritz Leiber — sci-fi, computer plays chess (1962)
            84: "Frankenstein",  # Shelley — created intelligence
            1059: "The World Set Free",  # H.G. Wells — atomic war, world government
            11696: "The Food of the Gods",  # H.G. Wells — scientific hubris
        },
        'Political Philosophy': {
            61: "The Communist Manifesto",  # Marx/Engels
            4341: "Mutual Aid",  # Kropotkin — cooperation vs competition
            23428: "The Conquest of Bread",  # Kropotkin — anarcho-communism
            1232: "The Prince",  # Machiavelli — power, statecraft
            815: "Democracy in America Vol 1",  # Tocqueville — political systems
            816: "Democracy in America Vol 2",  # Tocqueville
            3207: "Leviathan",  # Hobbes — social contract, sovereignty
            46333: "The Social Contract",  # Rousseau — general will
        },
        'Isolation/Survival': {
            521: "Robinson Crusoe",  # Defoe — survival, self-reliance
            1184: "The Count of Monte Cristo",  # Dumas — imprisonment, revenge
            30197: "Farthest North Vol I",  # Nansen — polar exploration, survival
            34120: "Farthest North Vol II",  # Nansen — polar exploration, survival
        },
        'Chess & Strategy': {
            33870: "Chess Fundamentals",  # Capablanca — strategy, endgames, annotated games
            5614: "Chess Strategy",  # Edward Lasker — opening theory, middlegame, endgame
            4913: "Chess and Checkers: the Way to Mastership",  # Edward Lasker — rules, strategy
            16377: "The Blue Book of Chess",  # Howard Staunton — rules, openings, game annotations
            34180: "The Exploits and Triumphs of Paul Morphy",  # Frederick M. Edge — biography, annotated games
            4902: "Chess History and Reminiscences",  # H.E. Bird — historical survey, anecdotes
            55278: "Chess Generalship, Vol. I: Grand Reconnaissance",  # Franklin K. Young — strategic principles
            10672: "Game and Playe of the Chesse",  # William Caxton — earliest printed chess text in English (1474)
            4542: "Checkmates for Three Pieces",  # W.B. Fishburne — tactical patterns
            4656: "Checkmates for Four Pieces",  # W.B. Fishburne — tactical patterns
            39445: "Hoyle's Games Modernized",  # Prof. Hoffmann / Edmond Hoyle — rules and strategy
            36821: "Maxims and Hints on Angling, Chess, Shooting",  # Richard Penn — chess maxims
            60420: "Observations on the Automaton Chess Player",  # Oxford graduate (~1819) — Mechanical Turk, proto-AI
            61410: "An Attempt to Analyse the Automaton Chess Player",  # Robert Willis (1821) — Mechanical Turk analysis
            64061: "War-Chess, or the Game of Battle",  # Charles Richardson — kriegsspiel, chess-derived war strategy
            63660: "The Game of Chess: A Play in One Act",  # Kenneth Sawyer Goodman (1914) — chess-themed drama
        },
        'Satire': {
            1695: "The Man Who Was Thursday",  # Chesterton — anarchists, conspiracy
            829: "Gulliver's Travels",  # Swift — political satire
            1080: "A Modest Proposal",  # Swift — savage satire
            19942: "Candide",  # Voltaire — satirical philosophy
        },
    }
# Flat dict for backward compatibility: {id: title}
# Derived from PRIORITY_WORKS_BY_CATEGORY; the category labels are dropped.
PRIORITY_WORKS = {
    gid: title
    for category_works in PRIORITY_WORKS_BY_CATEGORY.values()
    for gid, title in category_works.items()
}
# Subject filters for bulk retrieval
# Uses Library of Congress Subject Headings (LCSH) terminology
# Aligned with Deep Red themes: Soviet Mars colony, AI chess master,
# political satire, survival, ideological extremism
# NOTE(review): presumably matched against Gutenberg subject search —
# spelling must follow LCSH exactly; verify against the search endpoint.
SUBJECT_FILTERS = [
    # Fiction genres
    "Science fiction",
    "Satire",
    "Political fiction",
    "Allegories",
    "Utopias",
    "Dystopias",
    # Soviet/Russian themes
    "Soviet Union",
    "Russia",
    "Socialism",
    "Communism",
    "Propaganda",
    "Totalitarianism",
    "Collectivism",
    # Space and Mars
    "Space flight",
    "Mars (Planet)",
    "Interplanetary voyages",
    "Space colonies",
    "Life on other planets",
    "Outer space",
    "Astronautics",
    # AI/Machine/Chess themes
    "Chess",
    "Automata",
    "Automaton chess players",
    "Chess -- Early works to 1800",
    "War chess (Game)",
    "Machinery",
    "Robots",
    "Calculating machines",
    # Survival and isolation
    "Survival",
    "Wilderness survival",
    "Shipwrecks",  # Keep — analogous to crash survival
    "Castaways",
    "Prisoners",
    "Exiles",
    # Political/Social conflict
    "Revolutions",
    "Political science",
    "Secret societies",
    "Conspiracies",
    "Dictatorship",
    "Oligarchy",
    "Anarchism",
    "Radicalism",
    # Class and power
    "Capitalism",
    "Rich and poor",
    "Working class",
    "Labor",
    "Power (Social sciences)",
    # Human condition themes
    "Human evolution",
    "Evolution",
    "Civilization",
    "Future life",
    "End of the world",
    "Prophecies",
    # Colonisation/Exploration
    "Colonization",
    "Explorers",
    "Pioneers",
    "Frontier and pioneer life",
    # Psychology/Philosophy
    "Free will and determinism",
    "Man-machine systems",
    "Good and evil",
]
# Known pre-1969 author death dates for validation
# Authors who died before 1969 guarantee pre-1969 works
# NOTE(review): entries appear to follow Gutenberg's catalogue name format
# ("Last, First, birth-death") — presumably compared against RDF
# <pgterms:name> values; verify the exact matching site.
KNOWN_PRE1969_AUTHORS = {
    "Wells, H. G. (Herbert George), 1866-1946",
    "Verne, Jules, 1828-1905",
    "Dostoevsky, Fyodor, 1821-1881",
    "Tolstoy, Leo, graf, 1828-1910",
    "Chekhov, Anton Pavlovich, 1860-1904",
    "Bellamy, Edward, 1850-1898",
    "Morris, William, 1834-1896",
    "London, Jack, 1876-1916",
    "Gilman, Charlotte Perkins, 1860-1935",
    "Shelley, Mary Wollstonecraft, 1797-1851",
    "Marx, Karl, 1818-1883",
    "Engels, Friedrich, 1820-1895",
    "Kropotkin, Petr Alekseevich, kniaz, 1842-1921",
    "Burroughs, Edgar Rice, 1875-1950",
    "Doyle, Arthur Conan, Sir, 1859-1930",
    "Gorky, Maksim, 1868-1936",
    "Turgenev, Ivan Sergeevich, 1818-1883",
    "Gogol, Nikolai Vasilevich, 1809-1852",
    "Zamyatin, Evgeny Ivanovich, 1884-1937",
    "Čapek, Karel, 1890-1938",
    "Capablanca, José Raúl, 1888-1942",
    "Plato, 428 BC-348 BC",
    "Nietzsche, Friedrich Wilhelm, 1844-1900",
    "Machiavelli, Niccolò, 1469-1527",
    "Tocqueville, Alexis de, 1805-1859",
    "Hobbes, Thomas, 1588-1679",
    "Rousseau, Jean-Jacques, 1712-1778",
    "Defoe, Daniel, 1661-1731",
    "Dumas, Alexandre, 1802-1870",
    "Nansen, Fridtjof, 1861-1930",
    "Franklin, Benjamin, 1706-1790",
    "Lasker, Edward, 1885-1981",  # Note: died 1981 but chess book is 1915
    "Staunton, Howard, 1810-1874",
    "Bird, H. E. (Henry Edward), 1830-1908",
    "Edge, Frederick Milnes, 1830-1895",
    "Caxton, William, approximately 1422-1491",
    "Leiber, Fritz, 1910-1992",
    "Goodman, Kenneth Sawyer, 1883-1918",
    "Willis, Robert, 1800-1875",
    "Richardson, Charles, -1913",  # War-Chess author
    "Chesterton, G. K. (Gilbert Keith), 1874-1936",
    "Swift, Jonathan, 1667-1745",
    "Voltaire, 1694-1778",
}
def __init__(self, output_dir: str, max_year: int = TEMPORAL_CUTOFF_YEAR, prefer_http: bool = True,
             verbose: bool = False):
    """Set up the output directory and reload state from any previous run.

    Args:
        output_dir: Directory to save retrieved works
        max_year: Maximum publication year (temporal cutoff)
        prefer_http: If True, use HTTP for individual works (faster, more reliable).
            Library is still used for subject search if available.
        verbose: If True, print detailed per-work output instead of progress bars.
    """
    self.output_dir = output_dir
    self.max_year = max_year
    self.prefer_http = prefer_http
    self.verbose = verbose
    # Ensure the target directory exists and is writable before touching state.
    _ensure_directory(output_dir)
    # IDs already present in the corpus files — avoids duplicate downloads.
    self.retrieved_ids: set[int] = set()
    # IDs rejected on a previous run, mapped to the rejection reason.
    self.rejected_ids: dict[int, str] = {}
    # Re-runs skip anything recorded by either of these loaders.
    self._load_existing_corpus_ids()
    self._load_rejected_ids()
# ------------------------------------------------------------------
# Idempotency helpers
# ------------------------------------------------------------------
def _load_existing_corpus_ids(self):
"""Load IDs of works already in the corpus files to avoid duplicates."""
corpus_files = [
os.path.join(self.output_dir, 'gutenberg_corpus.jsonl'),
os.path.join(self.output_dir, 'priority_works.jsonl'),
]
for corpus_file in corpus_files:
if os.path.exists(corpus_file):
try:
with open(corpus_file, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
work = json.loads(line)
if 'id' in work:
self.retrieved_ids.add(work['id'])
if self.verbose:
print(f"Loaded {len(self.retrieved_ids)} existing work IDs from {os.path.basename(corpus_file)}")
except Exception as e:
print(f"Warning: Could not load existing corpus from {corpus_file}: {e}")
def _load_rejected_ids(self):
"""Load IDs previously rejected (non-English, bad date, etc.) to skip on re-run."""
path = os.path.join(self.output_dir, 'rejected_ids.json')
if os.path.exists(path):
try:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
self.rejected_ids = {int(k): v for k, v in data.items()}
if self.verbose:
print(f"Loaded {len(self.rejected_ids)} previously rejected work IDs")
except Exception as e:
print(f"Warning: Could not load rejected IDs from {path}: {e}")
def _save_rejected_ids(self):
"""Persist rejected IDs to disk (atomic write)."""
path = os.path.join(self.output_dir, 'rejected_ids.json')
fd, tmp = tempfile.mkstemp(suffix='.tmp', dir=self.output_dir,
prefix='.rejected_ids.')
try:
with os.fdopen(fd, 'w', encoding='utf-8') as f:
json.dump({str(k): v for k, v in self.rejected_ids.items()}, f,
ensure_ascii=False, indent=1)
os.replace(tmp, path)
except BaseException:
try:
os.unlink(tmp)
except OSError:
pass
raise
def _reject(self, gutenberg_id: int, reason: str) -> None:
"""Record a rejected work and persist immediately."""
self.rejected_ids[gutenberg_id] = reason
self._save_rejected_ids()
def status(self) -> dict:
"""Return a summary of existing corpus data in the output directory.
Useful for checking state before a run and confirming idempotency.
"""
info: dict = {
'output_dir': self.output_dir,
'existing_ids': len(self.retrieved_ids),
'rejected_ids': len(self.rejected_ids),
'files': {},
}
for name in ('gutenberg_corpus.jsonl', 'priority_works.jsonl'):
path = os.path.join(self.output_dir, name)
if os.path.exists(path):
size = os.path.getsize(path)
count = 0
total_chars = 0
with open(path, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
try:
w = json.loads(line)
count += 1
total_chars += w.get('length', 0)
except json.JSONDecodeError:
pass
info['files'][name] = {
'works': count,
'size_mb': round(size / (1024 * 1024), 1),
'total_chars': total_chars,
}
return info
# ------------------------------------------------------------------
# Text processing
# ------------------------------------------------------------------
def strip_gutenberg_headers(self, text: str) -> str:
    """Return *text* with the Project Gutenberg boilerplate cut away.

    The body is the span between the first recognised START marker and
    the first recognised END marker; missing markers fall back to the
    start/end of the whole text.
    """
    begin_markers = (
        r'\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*',
        r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK .* \*\*\*',
        r'START OF THIS PROJECT GUTENBERG EBOOK',
        r'START OF THE PROJECT GUTENBERG EBOOK',
    )
    finish_markers = (
        r'\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*',
        r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK .* \*\*\*',
        r'END OF THIS PROJECT GUTENBERG EBOOK',
        r'END OF THE PROJECT GUTENBERG EBOOK',
    )

    def _first_hit(patterns):
        # First pattern wins — the ``*** ... ***`` forms are tried before
        # the bare fallbacks, mirroring Gutenberg's marker evolution.
        for pat in patterns:
            m = re.search(pat, text, re.IGNORECASE)
            if m:
                return m
        return None

    start_match = _first_hit(begin_markers)
    begin = start_match.end() if start_match else 0
    end_match = _first_hit(finish_markers)
    finish = end_match.start() if end_match else len(text)
    return text[begin:finish].strip()
def _is_english_text(self, text: str, header: str = None) -> bool:
"""Check if text is in English.
Uses multiple heuristics:
1. Check Language field in Gutenberg header
2. Check ratio of ASCII characters (English text is mostly ASCII)
3. Check for common English words
Args:
text: The main text content
header: Optional header text to check for Language field
Returns:
True if text appears to be English, False otherwise
"""
# Check header for explicit language declaration
if header:
lang_match = re.search(r'Language:\s*(\w+)', header, re.IGNORECASE)
if lang_match:
language = lang_match.group(1).lower()
if language != 'english':
return False
return True
# Sample the first 5000 characters for analysis
sample = text[:5000] if len(text) > 5000 else text
# Count ASCII vs non-ASCII characters (excluding whitespace)
ascii_chars = sum(1 for c in sample if c.isalpha() and ord(c) < 128)
non_ascii_chars = sum(1 for c in sample if c.isalpha() and ord(c) >= 128)
total_alpha = ascii_chars + non_ascii_chars
if total_alpha == 0:
return False
ascii_ratio = ascii_chars / total_alpha
# English text should be >95% ASCII letters
# Non-English (Chinese, Japanese, Finnish, etc.) will have much lower ratios
if ascii_ratio < 0.90:
return False
# Additional check: look for common English words
common_english = ['the', 'and', 'of', 'to', 'a', 'in', 'is', 'that', 'it', 'was']
sample_lower = sample.lower()
english_word_count = sum(1 for word in common_english if f' {word} ' in sample_lower)
# Should find at least 3 common English words in the sample
if english_word_count < 3:
return False
return True
def _titles_match(self, expected: str, actual: str) -> bool:
"""Check if actual title matches expected title.
Uses fuzzy matching to handle minor variations like:
- Subtitle differences: "We" vs "We: A Novel"
- Article variations: "The Time Machine" vs "Time Machine, The"
- Punctuation differences
Args:
expected: The title we expect (from PRIORITY_WORKS)
actual: The title extracted from the downloaded content
Returns:
True if titles match sufficiently
"""
if not expected or not actual:
return False
# Normalise both titles for comparison
def normalize(title: str) -> str:
# Lowercase
t = title.lower()
# Remove leading articles
for article in ['the ', 'a ', 'an ']:
if t.startswith(article):
t = t[len(article):]
# Remove trailing articles (e.g., ", The")
for article in [', the', ', a', ', an']:
if t.endswith(article):
t = t[:-len(article)]
# Remove subtitles (after colon or dash)
t = re.split(r'[:\-—–]', t)[0]
# Remove punctuation and extra whitespace
t = re.sub(r'[^\w\s]', '', t)
t = ' '.join(t.split())
return t.strip()
norm_expected = normalize(expected)
norm_actual = normalize(actual)
# Exact match after normalisation
if norm_expected == norm_actual:
return True
# Check if one contains the other (for subtitle variations)
if norm_expected in norm_actual or norm_actual in norm_expected:
return True
# Check word overlap (at least 80% of expected words present)
expected_words = set(norm_expected.split())
actual_words = set(norm_actual.split())
if expected_words and len(expected_words & actual_words) / len(expected_words) >= 0.8:
return True
return False
# ------------------------------------------------------------------
# HTTP retrieval
# ------------------------------------------------------------------
def retrieve_by_http(self, gutenberg_id: int, title: str) -> dict:
    """Retrieve a work directly via HTTP (fallback method).

    Tries multiple strategies in order:
    1. Standard plain-text URL patterns
    2. Additional text URLs discovered from RDF metadata
    3. Additional static text URL patterns (-8.txt encoding variant)
    4. HTML-to-text conversion from RDF-listed HTML files
    5. HTML-to-text conversion from common HTML URL patterns
    6. Scrape the ebook download page for any text/HTML links

    Args:
        gutenberg_id: The Project Gutenberg ebook ID to download.
        title: Expected title; "Unknown_" prefixed titles skip the
            title-match validation and instead extract a title.

    Returns:
        Dict with id/title/author/text/length/pub_year/method keys
        (plus author_death_year when RDF provides one), or None when
        retrieval or validation fails (the failure is also logged in
        rejected_ids for non-English or mismatched works).
    """
    try:
        # First, fetch RDF metadata for better date detection
        rdf_metadata = self._fetch_gutenberg_rdf(gutenberg_id)
        # ---- Strategy 1: Standard plain-text URL patterns ----
        urls = [
            f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.txt",
            f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}-0.txt",
            f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}.txt",
        ]
        # ---- Strategy 2: Text URLs from RDF metadata ----
        if rdf_metadata:
            for rdf_url in rdf_metadata.get('text_urls', []):
                if rdf_url not in urls:
                    urls.append(rdf_url)
        # ---- Strategy 3: Additional static text URL patterns ----
        extra_txt = f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}-8.txt"
        if extra_txt not in urls:
            urls.append(extra_txt)
        text = None
        retrieval_method = 'http'
        # First candidate URL that returns a plausible body wins.
        for url in urls:
            print(f" Trying {url}")
            try:
                response = requests.get(url, timeout=30)
                if response.status_code == 200:
                    candidate = response.text
                    # Sanity: reject tiny responses (< 500 chars) that are
                    # likely index pages or readme stubs, not actual books
                    if len(candidate) >= 500:
                        text = candidate
                        break
            except requests.RequestException:
                # Network error on this URL — try the next candidate.
                continue
        # ---- Strategy 4–6: HTML fallback ----
        if not text:
            text = self._try_html_fallback(gutenberg_id, rdf_metadata)
            if text:
                retrieval_method = 'http_html'
        if not text:
            print(f" Could not retrieve from any URL")
            return None
        # Check if text is in English (skip non-English content)
        header = text[:3000]
        if not self._is_english_text(text, header):
            # Try to extract title for logging
            title_match = re.search(r'Title:\s*(.+)', header)
            display_title = title_match.group(1).strip() if title_match else title
            print(f" Skipping non-English content: {display_title}")
            self._reject(gutenberg_id, f"non-English: {display_title}")
            return None
        # Extract title — prefer RDF, then text header, then provided
        extracted_title = title
        if rdf_metadata and rdf_metadata.get('title'):
            extracted_title = rdf_metadata['title']
        elif title.startswith("Unknown_"):
            title_match = re.search(r'Title:\s*(.+)', text[:2000])
            if title_match:
                extracted_title = title_match.group(1).strip()
        # Validate title matches expected (for priority works)
        if not title.startswith("Unknown_") and not self._titles_match(title, extracted_title):
            print(f" Title mismatch! Expected: '{title}', Got: '{extracted_title}'")
            self._reject(gutenberg_id, f"title-mismatch: expected '{title}', got '{extracted_title}'")
            return None
        # Try to extract publication year — multiple sources (ordered by reliability)
        pub_year = None
        author_death_year = None
        # Get author death year from RDF (useful for validation)
        if rdf_metadata:
            author_death_year = rdf_metadata.get('author_death_year')
        # 1. Check text header for explicit publication patterns (most reliable)
        # (_extract_year_from_text_header is defined elsewhere in this file)
        pub_year = self._extract_year_from_text_header(text)
        # 2. Check RDF description for time period
        if not pub_year and rdf_metadata:
            pub_year = self._extract_year_from_description(rdf_metadata.get('description', ''))
        # 3. Check title for year (lowest priority — only for reference/annual works)
        # This is last because titles like "1984" are misleading (written 1949)
        if not pub_year:
            pub_year = self._extract_year_from_title(extracted_title)
        # Strip headers/footers
        cleaned_text = self.strip_gutenberg_headers(text)
        # Extract author — prefer RDF, then text header
        author = "Unknown"
        if rdf_metadata and rdf_metadata.get('author'):
            author = rdf_metadata['author']
        else:
            author_match = re.search(r'Author:\s*(.+)', text[:2000])
            if author_match:
                author = author_match.group(1).strip()
        result = {
            'id': gutenberg_id,
            'title': extracted_title,
            'author': author,
            'text': cleaned_text,
            'length': len(cleaned_text),
            'pub_year': pub_year,
            'method': retrieval_method
        }
        # Add author death year if found (useful for temporal validation)
        if author_death_year:
            result['author_death_year'] = author_death_year
        return result
    except Exception as e:
        # Broad catch: any unexpected failure degrades to "not retrieved".
        print(f" Error with HTTP retrieval: {e}")
        return None
# ------------------------------------------------------------------
# HTML-to-text fallback helpers
# ------------------------------------------------------------------
def _html_to_text(self, html: str) -> str:
    """Strip an HTML document down to plain text.

    Removes scripts, styles, and navigation chrome, collapses runs of
    blank lines, and rejects results too short to be a real book.

    Args:
        html: Raw HTML string

    Returns:
        Extracted plain text, or None if extraction fails or the result
        is too short to be a real book.
    """
    try:
        parsed = BeautifulSoup(html, 'html.parser')
        # Drop non-content elements before extracting text.
        for junk in parsed.find_all(['script', 'style', 'nav', 'header', 'footer']):
            junk.decompose()
        plain = parsed.get_text(separator='\n')
        plain = re.sub(r'\n{3,}', '\n\n', plain).strip()
        # Anything under 10K chars (~2000 words) is index/navigation
        # noise rather than actual book content.
        return plain if len(plain) >= 10000 else None
    except Exception:
        return None
def _try_html_fallback(self, gutenberg_id: int, rdf_metadata: dict) -> str:
    """Fetch the book as HTML and strip tags when no plain-text file worked.

    URL order: RDF-listed HTML files first, then the well-known Gutenberg
    HTML patterns, then (as a last resort) links scraped from the ebook
    download page.

    Args:
        gutenberg_id: The Project Gutenberg ID
        rdf_metadata: Previously fetched RDF metadata dict (may be None)

    Returns:
        Plain text extracted from HTML, or None on failure.
    """
    candidates = list(rdf_metadata.get('html_urls', [])) if rdf_metadata else []
    candidates += [
        f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}-images.html",
        f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.html",
        f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}-h/{gutenberg_id}-h.htm",
        f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}-h/{gutenberg_id}-h.html",
    ]
    # De-duplicate while preserving first-seen order.
    ordered = list(dict.fromkeys(candidates))
    for url in ordered:
        print(f" Trying HTML fallback: {url}")
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            continue
        if response.status_code == 200 and len(response.text) >= 500:
            converted = self._html_to_text(response.text)
            if converted:
                print(f" Retrieved via HTML conversion")
                return converted
    # Nothing usable at the static URLs — scrape the ebook page itself.
    return self._try_ebook_page_fallback(gutenberg_id)
def _try_ebook_page_fallback(self, gutenberg_id: int) -> str:
    """Last-resort: scrape the Gutenberg ebook page for download links.

    Parses ``https://www.gutenberg.org/ebooks/{id}`` looking for any
    text/plain or text/html download links not yet tried. Plain-text
    links are preferred over HTML links.

    Args:
        gutenberg_id: The Project Gutenberg ID

    Returns:
        Plain text (possibly extracted from HTML), or None.
    """
    page_url = f"https://www.gutenberg.org/ebooks/{gutenberg_id}"
    print(f" Trying ebook page fallback: {page_url}")
    try:
        response = requests.get(page_url, timeout=30)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        # Look for download links with content-type info
        # Gutenberg marks them in <a> tags with a type= attribute
        text_links = []
        html_links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag.get('href', '')
            link_type = a_tag.get('type', '').lower()
            link_text = a_tag.get_text(strip=True).lower()
            # Build absolute URL (site-relative links only; other schemes skipped)
            if href.startswith('/'):
                href = f"https://www.gutenberg.org{href}"
            elif not href.startswith('http'):
                continue
            # Skip audio, images, readme, zip, and rdf files
            if any(ext in href.lower() for ext in [
                '.ogg', '.mp3', '.wav', '.jpg', '.png', '.zip',
                'readme', '.rdf', 'cover',
            ]):
                continue
            # Classify by declared MIME type first, then by file extension.
            if 'text/plain' in link_type:
                text_links.append(href)
            elif 'text/html' in link_type:
                html_links.append(href)
            elif href.endswith('.txt') and 'readme' not in link_text:
                text_links.append(href)
            elif href.endswith(('.html', '.htm')) and 'index' not in link_text:
                html_links.append(href)
        # Try text links first
        for url in text_links:
            print(f" Trying scraped text link: {url}")
            try:
                resp = requests.get(url, timeout=30)
                if resp.status_code == 200 and len(resp.text) >= 500:
                    return resp.text
            except requests.RequestException:
                continue
        # Then HTML links
        for url in html_links:
            print(f" Trying scraped HTML link: {url}")
            try:
                resp = requests.get(url, timeout=30)
                if resp.status_code == 200 and len(resp.text) >= 500:
                    text = self._html_to_text(resp.text)
                    if text:
                        print(f" Retrieved via ebook-page HTML conversion")
                        return text
            except requests.RequestException:
                continue
    except Exception as e:
        # Best-effort scrape: any failure simply means no fallback result.
        if self.verbose:
            print(f" Ebook page fallback failed: {e}")
    return None
# ------------------------------------------------------------------
# Year extraction helpers
# ------------------------------------------------------------------
def _extract_year_from_title(self, title: str) -> int:
"""Extract publication year from title if present.
Handles titles like:
- "The 2002 CIA World Factbook"
- "1984" (Orwell)
- "The Year 1920"
"""
if not title:
return None
# Look for 4-digit year in title
# Match years that look like publication dates (1800-2100)
year_matches = re.findall(r'\b(1[89]\d{2}|20\d{2}|21\d{2})\b', title)
for year_str in year_matches:
year = int(year_str)
# Filter out years that are likely not publication dates
# "1984" as a title is fine (it's the book name, published 1949)
# But "The 2002 CIA World Factbook" means published in 2002
if year >= 1900 and year <= 2100:
# Check if this looks like a factual/reference work with year in title
title_lower = title.lower()
if any(keyword in title_lower for keyword in [
'factbook', 'almanac', 'yearbook', 'annual', 'report',
'edition', 'volume', 'survey', 'census', 'statistics'
]):
return year
# Check for patterns like "The Year XXXX" or "XXXX Edition"
if re.search(rf'\b(year|edition|vol\.?|volume)\s*{year}\b', title, re.IGNORECASE):
return year
if re.search(rf'\b{year}\s*(edition|vol\.?|volume|annual|report)\b', title, re.IGNORECASE):
return year
return None
def _fetch_gutenberg_rdf(self, gutenberg_id: int) -> dict:
"""Fetch and parse Gutenberg RDF metadata.
Returns dict with:
- title: Book title
- author: Author name
- author_birth_year: Author birth year (if available)
- author_death_year: Author death year (if available)
- issued_date: Gutenberg release date
- subjects: List of subjects
- description: Book description
- text_urls: List of text/plain file URLs from hasFormat
- html_urls: List of text/html file URLs from hasFormat
"""
rdf_url = f"https://www.gutenberg.org/ebooks/{gutenberg_id}.rdf"
try:
response = requests.get(rdf_url, timeout=15)
if response.status_code != 200:
return None
rdf_text = response.text
metadata = {}
# Extract title
title_match = re.search(r'<dcterms:title>([^<]+)</dcterms:title>', rdf_text)
if title_match:
metadata['title'] = title_match.group(1).strip()
# Extract author name
author_match = re.search(r'<pgterms:name>([^<]+)</pgterms:name>', rdf_text)
if author_match:
metadata['author'] = author_match.group(1).strip()