TuftsIntroDH · bbobb · Apr 14, 2025 · Apr 14, 2025 · Apr 14, 2025
diff --git a/08_tf-idf.ipynb b/08_tf-idf.ipynb
@@ -46,9 +46,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting lxml\n",
+      "  Downloading lxml-5.3.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.6 kB)\n",
+      "Downloading lxml-5.3.2-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: lxml\n",
+      "Successfully installed lxml-5.3.2\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "%pip install lxml"
    ]
@@ -64,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -87,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -129,9 +143,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "xml/tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
+      "xml/tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
+      "xml/tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
+      "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
+      "xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n"
+     ]
+    }
+   ],
    "source": [
     "for file in files:\n",
     "    # print the name of the file as a sanity check\n",
@@ -193,9 +219,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: nltk in /home/codespace/.local/lib/python3.12/site-packages (3.9.1)\n",
+      "Requirement already satisfied: click in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (8.1.8)\n",
+      "Requirement already satisfied: joblib in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (1.4.2)\n",
+      "Requirement already satisfied: regex>=2021.8.3 in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (2024.11.6)\n",
+      "Requirement already satisfied: tqdm in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (4.67.1)\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "%pip install nltk"
    ]
@@ -211,9 +250,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/codespace/nltk_data...\n",
+      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n",
+      "[nltk_data] Downloading package punkt_tab to\n",
+      "[nltk_data]     /home/codespace/nltk_data...\n",
+      "[nltk_data]   Unzipping tokenizers/punkt_tab.zip.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "import nltk\n",
     "\n",
@@ -235,9 +296,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.\n",
+      "There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.\n",
+      "There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.\n",
+      "There are 200625 tokens in tlg0012.tlg001.perseus-eng3.txt.\n"
+     ]
+    }
+   ],
    "source": [
     "# Initialize the tokenizer\n",
     "from nltk.tokenize import word_tokenize\n",
@@ -283,7 +355,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -311,9 +383,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]"
    ]
@@ -330,9 +413,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ZeroDivisionError",
+     "evalue": "division by zero",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mZeroDivisionError\u001b[39m                         Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m     16\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmath\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m log10\n\u001b[32m     18\u001b[39m n_docs = \u001b[38;5;28mlen\u001b[39m(tokenized_texts.keys())\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m idf_achilles = log10(\u001b[43mn_docs\u001b[49m\u001b[43m \u001b[49m\u001b[43m/\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_achilles\u001b[49m)\n\u001b[32m     21\u001b[39m idf_odysseus = log10(n_docs / df_odysseus)\n\u001b[32m     23\u001b[39m \u001b[38;5;28mprint\u001b[39m(idf_achilles)\n",
+      "\u001b[31mZeroDivisionError\u001b[39m: division by zero"
+     ]
+    }
+   ],
    "source": [
     "df_achilles = 0\n",
     "df_odysseus = 0\n",
@@ -361,9 +456,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'idf_achilles' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m     12\u001b[39m tf_odysseus = values[\u001b[33m'\u001b[39m\u001b[33mcounts\u001b[39m\u001b[33m'\u001b[39m][\u001b[33m'\u001b[39m\u001b[33modysseus\u001b[39m\u001b[33m'\u001b[39m] / total_terms\n\u001b[32m     14\u001b[39m \u001b[38;5;66;03m# Remember, the simplest version of TF-IDF is just\u001b[39;00m\n\u001b[32m     15\u001b[39m \u001b[38;5;66;03m# TF * 1/DF\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m tf_idf_achilles = tf_achilles * \u001b[43midf_achilles\u001b[49m\n\u001b[32m     17\u001b[39m tf_idf_odysseus = tf_odysseus * idf_odysseus\n\u001b[32m     19\u001b[39m \u001b[38;5;66;03m# Now we can report on the statistics for this file\u001b[39;00m\n",
+      "\u001b[31mNameError\u001b[39m: name 'idf_achilles' is not defined"
+     ]
+    }
+   ],
    "source": [
     "# Now let's calculate the TF-IDF \"score\" for each term in each document.\n",
     "\n",
@@ -420,9 +527,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{1, 2, 3}"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "my_list = [1, 1, 2, 3, 3]\n",
     "\n",
@@ -438,9 +556,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'tlg0012.tlg001.perseus-eng4.txt': set(),\n",
+       " 'tlg0012.tlg002.perseus-eng3.txt': set(),\n",
+       " 'tlg0012.tlg002.perseus-eng4.txt': set(),\n",
+       " 'tlg0012.tlg001.perseus-eng3.txt': set()}"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "non_universal_terms = {}\n",
     "\n",