|
46 | 46 | }, |
47 | 47 | { |
48 | 48 | "cell_type": "code", |
49 | | - "execution_count": 1, |
| 49 | + "execution_count": null, |
50 | 50 | "metadata": {}, |
51 | | - "outputs": [ |
52 | | - { |
53 | | - "name": "stdout", |
54 | | - "output_type": "stream", |
55 | | - "text": [ |
56 | | - "Collecting lxml\n", |
57 | | - " Downloading lxml-5.3.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n", |
58 | | - "Downloading lxml-5.3.1-cp313-cp313-manylinux_2_28_x86_64.whl (5.0 MB)\n", |
59 | | - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", |
60 | | - "\u001b[?25hInstalling collected packages: lxml\n", |
61 | | - "Successfully installed lxml-5.3.1\n", |
62 | | - "Note: you may need to restart the kernel to use updated packages.\n" |
63 | | - ] |
64 | | - } |
65 | | - ], |
| 51 | + "outputs": [], |
66 | 52 | "source": [ |
67 | 53 | "%pip install lxml" |
68 | 54 | ] |
|
78 | 64 | }, |
79 | 65 | { |
80 | 66 | "cell_type": "code", |
81 | | - "execution_count": 2, |
| 67 | + "execution_count": null, |
82 | 68 | "metadata": {}, |
83 | 69 | "outputs": [], |
84 | 70 | "source": [ |
|
101 | 87 | }, |
102 | 88 | { |
103 | 89 | "cell_type": "code", |
104 | | - "execution_count": 9, |
| 90 | + "execution_count": null, |
105 | 91 | "metadata": {}, |
106 | 92 | "outputs": [], |
107 | 93 | "source": [ |
|
119 | 105 | }, |
120 | 106 | { |
121 | 107 | "cell_type": "code", |
122 | | - "execution_count": 4, |
| 108 | + "execution_count": null, |
123 | 109 | "metadata": {}, |
124 | 110 | "outputs": [], |
125 | 111 | "source": [ |
|
143 | 129 | }, |
144 | 130 | { |
145 | 131 | "cell_type": "code", |
146 | | - "execution_count": 10, |
| 132 | + "execution_count": null, |
147 | 133 | "metadata": {}, |
148 | | - "outputs": [ |
149 | | - { |
150 | | - "name": "stdout", |
151 | | - "output_type": "stream", |
152 | | - "text": [ |
153 | | - "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", |
154 | | - "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n", |
155 | | - "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-grc2.xml\n", |
156 | | - "xml/tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n", |
157 | | - "xml/tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", |
158 | | - "xml/tlg0012/tlg002/tlg0012.tlg002.perseus-grc2.xml\n", |
159 | | - "xml/tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n", |
160 | | - "xml/tlg0012/tlg003/tlg0012.tlg003.perseus-grc1.xml\n" |
161 | | - ] |
162 | | - } |
163 | | - ], |
| 134 | + "outputs": [], |
164 | 135 | "source": [ |
165 | 136 | "for file in files:\n", |
166 | 137 | " # print the name of the file as a sanity check\n", |
|
222 | 193 | }, |
223 | 194 | { |
224 | 195 | "cell_type": "code", |
225 | | - "execution_count": 6, |
| 196 | + "execution_count": null, |
226 | 197 | "metadata": {}, |
227 | | - "outputs": [ |
228 | | - { |
229 | | - "name": "stdout", |
230 | | - "output_type": "stream", |
231 | | - "text": [ |
232 | | - "Collecting nltk\n", |
233 | | - " Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)\n", |
234 | | - "Collecting click (from nltk)\n", |
235 | | - " Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)\n", |
236 | | - "Collecting joblib (from nltk)\n", |
237 | | - " Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)\n", |
238 | | - "Collecting regex>=2021.8.3 (from nltk)\n", |
239 | | - " Downloading regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", |
240 | | - "Collecting tqdm (from nltk)\n", |
241 | | - " Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)\n", |
242 | | - "Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)\n", |
243 | | - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", |
244 | | - "\u001b[?25hDownloading regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)\n", |
245 | | - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m796.9/796.9 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", |
246 | | - "\u001b[?25hDownloading click-8.1.8-py3-none-any.whl (98 kB)\n", |
247 | | - "Downloading joblib-1.4.2-py3-none-any.whl (301 kB)\n", |
248 | | - "Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)\n", |
249 | | - "Installing collected packages: tqdm, regex, joblib, click, nltk\n", |
250 | | - "Successfully installed click-8.1.8 joblib-1.4.2 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1\n", |
251 | | - "Note: you may need to restart the kernel to use updated packages.\n" |
252 | | - ] |
253 | | - } |
254 | | - ], |
| 198 | + "outputs": [], |
255 | 199 | "source": [ |
256 | 200 | "%pip install nltk" |
257 | 201 | ] |
|
267 | 211 | }, |
268 | 212 | { |
269 | 213 | "cell_type": "code", |
270 | | - "execution_count": 13, |
| 214 | + "execution_count": null, |
271 | 215 | "metadata": {}, |
272 | | - "outputs": [ |
273 | | - { |
274 | | - "name": "stderr", |
275 | | - "output_type": "stream", |
276 | | - "text": [ |
277 | | - "[nltk_data] Downloading package punkt to /home/charles/nltk_data...\n", |
278 | | - "[nltk_data] Package punkt is already up-to-date!\n", |
279 | | - "[nltk_data] Downloading package punkt_tab to\n", |
280 | | - "[nltk_data] /home/charles/nltk_data...\n", |
281 | | - "[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n" |
282 | | - ] |
283 | | - }, |
284 | | - { |
285 | | - "data": { |
286 | | - "text/plain": [ |
287 | | - "True" |
288 | | - ] |
289 | | - }, |
290 | | - "execution_count": 13, |
291 | | - "metadata": {}, |
292 | | - "output_type": "execute_result" |
293 | | - } |
294 | | - ], |
| 216 | + "outputs": [], |
295 | 217 | "source": [ |
296 | 218 | "import nltk\n", |
297 | 219 | "\n", |
|
313 | 235 | }, |
314 | 236 | { |
315 | 237 | "cell_type": "code", |
316 | | - "execution_count": 17, |
| 238 | + "execution_count": null, |
317 | 239 | "metadata": {}, |
318 | | - "outputs": [ |
319 | | - { |
320 | | - "name": "stdout", |
321 | | - "output_type": "stream", |
322 | | - "text": [ |
323 | | - "There are 200625 tokens in tlg0012.tlg001.perseus-eng3.txt.\n", |
324 | | - "There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.\n", |
325 | | - "There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.\n", |
326 | | - "There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.\n" |
327 | | - ] |
328 | | - } |
329 | | - ], |
| 240 | + "outputs": [], |
330 | 241 | "source": [ |
331 | 242 | "# Initialize the tokenizer\n", |
332 | 243 | "from nltk.tokenize import word_tokenize\n", |
|
372 | 283 | }, |
373 | 284 | { |
374 | 285 | "cell_type": "code", |
375 | | - "execution_count": 18, |
| 286 | + "execution_count": null, |
376 | 287 | "metadata": {}, |
377 | 288 | "outputs": [], |
378 | 289 | "source": [ |
|
400 | 311 | }, |
401 | 312 | { |
402 | 313 | "cell_type": "code", |
403 | | - "execution_count": 19, |
| 314 | + "execution_count": null, |
404 | 315 | "metadata": {}, |
405 | | - "outputs": [ |
406 | | - { |
407 | | - "data": { |
408 | | - "text/plain": [ |
409 | | - "128" |
410 | | - ] |
411 | | - }, |
412 | | - "execution_count": 19, |
413 | | - "metadata": {}, |
414 | | - "output_type": "execute_result" |
415 | | - } |
416 | | - ], |
| 316 | + "outputs": [], |
417 | 317 | "source": [ |
418 | 318 | "tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]" |
419 | 319 | ] |
|
430 | 330 | }, |
431 | 331 | { |
432 | 332 | "cell_type": "code", |
433 | | - "execution_count": 29, |
| 333 | + "execution_count": null, |
434 | 334 | "metadata": {}, |
435 | | - "outputs": [ |
436 | | - { |
437 | | - "name": "stdout", |
438 | | - "output_type": "stream", |
439 | | - "text": [ |
440 | | - "0.0\n" |
441 | | - ] |
442 | | - } |
443 | | - ], |
| 335 | + "outputs": [], |
444 | 336 | "source": [ |
445 | 337 | "df_achilles = 0\n", |
446 | 338 | "df_odysseus = 0\n", |
|
469 | 361 | }, |
470 | 362 | { |
471 | 363 | "cell_type": "code", |
472 | | - "execution_count": 32, |
| 364 | + "execution_count": null, |
473 | 365 | "metadata": {}, |
474 | | - "outputs": [ |
475 | | - { |
476 | | - "name": "stdout", |
477 | | - "output_type": "stream", |
478 | | - "text": [ |
479 | | - "In tlg0012.tlg001.perseus-eng3.txt:\n", |
480 | | - "TF of achilles: 0.002043613707165109\n", |
481 | | - "TF of odysseus: 0.000638006230529595\n", |
482 | | - "TF-IDF of achilles: 0.0\n", |
483 | | - "TF-IDF of odysseus: 0.0\n", |
484 | | - "\n", |
485 | | - "In tlg0012.tlg001.perseus-eng4.txt:\n", |
486 | | - "TF of achilles: 0.002403038534032606\n", |
487 | | - "TF of odysseus: 0.0007061061095261686\n", |
488 | | - "TF-IDF of achilles: 0.0\n", |
489 | | - "TF-IDF of odysseus: 0.0\n", |
490 | | - "\n", |
491 | | - "In tlg0012.tlg002.perseus-eng3.txt:\n", |
492 | | - "TF of achilles: 0.0001048279838302835\n", |
493 | | - "TF of odysseus: 0.0041603606082643765\n", |
494 | | - "TF-IDF of achilles: 0.0\n", |
495 | | - "TF-IDF of odysseus: 0.0\n", |
496 | | - "\n", |
497 | | - "In tlg0012.tlg002.perseus-eng4.txt:\n", |
498 | | - "TF of achilles: 0.0001254955227626732\n", |
499 | | - "TF of odysseus: 0.0042816119530794386\n", |
500 | | - "TF-IDF of achilles: 0.0\n", |
501 | | - "TF-IDF of odysseus: 0.0\n", |
502 | | - "\n" |
503 | | - ] |
504 | | - } |
505 | | - ], |
| 366 | + "outputs": [], |
506 | 367 | "source": [ |
507 | 368 | "# Now let's calculate the TF-IDF \"score\" for each term in each document.\n", |
508 | 369 | "\n", |
|
559 | 420 | }, |
560 | 421 | { |
561 | 422 | "cell_type": "code", |
562 | | - "execution_count": 33, |
| 423 | + "execution_count": null, |
563 | 424 | "metadata": {}, |
564 | | - "outputs": [ |
565 | | - { |
566 | | - "data": { |
567 | | - "text/plain": [ |
568 | | - "{1, 2, 3}" |
569 | | - ] |
570 | | - }, |
571 | | - "execution_count": 33, |
572 | | - "metadata": {}, |
573 | | - "output_type": "execute_result" |
574 | | - } |
575 | | - ], |
| 425 | + "outputs": [], |
576 | 426 | "source": [ |
577 | 427 | "my_list = [1, 1, 2, 3, 3]\n", |
578 | 428 | "\n", |
|
590 | 440 | "cell_type": "code", |
591 | 441 | "execution_count": null, |
592 | 442 | "metadata": {}, |
593 | | - "outputs": [ |
594 | | - { |
595 | | - "data": { |
596 | | - "text/plain": [ |
597 | | - "2286" |
598 | | - ] |
599 | | - }, |
600 | | - "execution_count": 37, |
601 | | - "metadata": {}, |
602 | | - "output_type": "execute_result" |
603 | | - } |
604 | | - ], |
| 443 | + "outputs": [], |
605 | 444 | "source": [ |
606 | 445 | "non_universal_terms = {}\n", |
607 | 446 | "\n", |
|
0 commit comments