Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 156 additions & 24 deletions 08_tf-idf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting lxml\n",
" Downloading lxml-5.3.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.6 kB)\n",
"Downloading lxml-5.3.2-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hInstalling collected packages: lxml\n",
"Successfully installed lxml-5.3.2\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install lxml"
]
Expand All @@ -64,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -87,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -105,7 +119,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -129,9 +143,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"xml/tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
"xml/tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
"xml/tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
"xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
"xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n"
]
}
],
"source": [
"for file in files:\n",
" # print the name of the file as a sanity check\n",
Expand Down Expand Up @@ -193,9 +219,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in /home/codespace/.local/lib/python3.12/site-packages (3.9.1)\n",
"Requirement already satisfied: click in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (8.1.8)\n",
"Requirement already satisfied: joblib in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (2024.11.6)\n",
"Requirement already satisfied: tqdm in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (4.67.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install nltk"
]
Expand All @@ -211,9 +250,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/codespace/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /home/codespace/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"\n",
Expand All @@ -235,9 +296,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.\n",
"There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.\n",
"There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.\n",
"There are 200625 tokens in tlg0012.tlg001.perseus-eng3.txt.\n"
]
}
],
"source": [
"# Initialize the tokenizer\n",
"from nltk.tokenize import word_tokenize\n",
Expand Down Expand Up @@ -283,7 +355,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -311,9 +383,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]"
]
Expand All @@ -330,9 +413,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ZeroDivisionError",
"evalue": "division by zero",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mZeroDivisionError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmath\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m log10\n\u001b[32m 18\u001b[39m n_docs = \u001b[38;5;28mlen\u001b[39m(tokenized_texts.keys())\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m idf_achilles = log10(\u001b[43mn_docs\u001b[49m\u001b[43m \u001b[49m\u001b[43m/\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf_achilles\u001b[49m)\n\u001b[32m 21\u001b[39m idf_odysseus = log10(n_docs / df_odysseus)\n\u001b[32m 23\u001b[39m \u001b[38;5;28mprint\u001b[39m(idf_achilles)\n",
"\u001b[31mZeroDivisionError\u001b[39m: division by zero"
]
}
],
"source": [
"df_achilles = 0\n",
"df_odysseus = 0\n",
Expand Down Expand Up @@ -361,9 +456,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'idf_achilles' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m 12\u001b[39m tf_odysseus = values[\u001b[33m'\u001b[39m\u001b[33mcounts\u001b[39m\u001b[33m'\u001b[39m][\u001b[33m'\u001b[39m\u001b[33modysseus\u001b[39m\u001b[33m'\u001b[39m] / total_terms\n\u001b[32m 14\u001b[39m \u001b[38;5;66;03m# Remember, the simplest version of TF-IDF is just\u001b[39;00m\n\u001b[32m 15\u001b[39m \u001b[38;5;66;03m# TF * 1/DF\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m tf_idf_achilles = tf_achilles * \u001b[43midf_achilles\u001b[49m\n\u001b[32m 17\u001b[39m tf_idf_odysseus = tf_odysseus * idf_odysseus\n\u001b[32m 19\u001b[39m \u001b[38;5;66;03m# Now we can report on the statistics for this file\u001b[39;00m\n",
"\u001b[31mNameError\u001b[39m: name 'idf_achilles' is not defined"
]
}
],
"source": [
"# Now let's calculate the TF-IDF \"score\" for each term in each document.\n",
"\n",
Expand Down Expand Up @@ -420,9 +527,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{1, 2, 3}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_list = [1, 1, 2, 3, 3]\n",
"\n",
Expand All @@ -438,9 +556,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{'tlg0012.tlg001.perseus-eng4.txt': set(),\n",
" 'tlg0012.tlg002.perseus-eng3.txt': set(),\n",
" 'tlg0012.tlg002.perseus-eng4.txt': set(),\n",
" 'tlg0012.tlg001.perseus-eng3.txt': set()}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"non_universal_terms = {}\n",
"\n",
Expand Down
Loading