Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 214 additions & 17 deletions tf-idf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,14 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting lxml\n",
" Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n",
"Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: lxml\n",
"Successfully installed lxml-5.3.1\n",
"Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
Expand All @@ -28,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -38,7 +33,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -47,7 +42,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -62,18 +57,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
"tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n"
]
}
],
Expand All @@ -96,7 +91,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -116,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 12,
"metadata": {},
"outputs": [
{
Expand All @@ -125,7 +120,7 @@
"4"
]
},
"execution_count": 71,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -141,6 +136,208 @@
"\n",
"df_ulysses"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"ename": "IndentationError",
"evalue": "expected an indented block after 'with' statement on line 7 (3658932252.py, line 8)",
"output_type": "error",
"traceback": [
"\u001b[0;36m Cell \u001b[0;32mIn[13], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m text = f.read().lower()\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block after 'with' statement on line 7\n"
]
}
],
"source": [
"from TEI_NS import word_tokenize\n",
"tokenized_texts = {}\n",
"text_files = Path(\".\").glob(\"tlg0012.tlg00*.perseus-eng[1-4].txt\")\n",
"for file in text_files:\n",
" name = str(file)\n",
"\n",
" with open(file) as f:\n",
" text = f.read().lower()\n",
" tokens = word_tokenize(text)\n",
" print(f\"There are {len(tokens)} tokens in {name}.\")\n",
" tokenized_texts[name] = tokens"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tokenized_texts' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Counter\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m filename, tokens \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtokenized_texts\u001b[49m\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 3\u001b[0m counts \u001b[38;5;241m=\u001b[39m Counter(tokens)\n\u001b[1;32m 5\u001b[0m tokenized_texts[filename] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokens\u001b[39m\u001b[38;5;124m\"\u001b[39m: tokens, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m\"\u001b[39m: counts}\n",
"\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined"
]
}
],
"source": [
"from collections import Counter\n",
"for filename, tokens in tokenized_texts.items():\n",
" counts = Counter(tokens)\n",
"\n",
" tokenized_texts[filename] = {\"tokens\": tokens, \"counts\": counts}\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tokenized_texts' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtokenized_texts\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtlg0012.tlg001.perseus-eng3.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124modysseus\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined"
]
}
],
"source": [
"tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tokenized_texts' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[16], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m df_achilles \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 2\u001b[0m df_odysseus \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m filename, values \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtokenized_texts\u001b[49m\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124modysseus\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m values[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 6\u001b[0m df_odysseus \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
"\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined"
]
}
],
"source": [
"df_achilles = 0\n",
"df_odysseus = 0\n",
"\n",
"for filename, values in tokenized_texts.items():\n",
" if \"odysseus\" in values['counts']:\n",
" df_odysseus += 1\n",
" \n",
" if \"achilles\" in values[\"counts\"]:\n",
" df_achilles += 1\n",
"\n",
"from math import log10\n",
"\n",
"n_docs = len(tokenized_texts.keys())\n",
"\n",
"idf_achilles = log10(n_docs / df_achilles)\n",
"idf_odysseus = log10(n_docs / df_odysseus)\n",
"\n",
"print(idf_achilles)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "unterminated triple-quoted f-string literal (detected at line 7) (695929936.py, line 7)",
"output_type": "error",
"traceback": [
"\u001b[0;36m Cell \u001b[0;32mIn[5], line 7\u001b[0;36m\u001b[0m\n\u001b[0;31m print(f\"\"\"In {filename}:\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m unterminated triple-quoted f-string literal (detected at line 7)\n"
]
}
],
"source": [
"for filename, values in tokenized_texts.items():\n",
" total_terms = len(values['tokens'])\n",
" tf_achilles = values['counts']['achilles'] / total_terms\n",
" tf_odysseus = values['counts']['odysseus'] / total_terms\n",
" tf_idf_achilles = tf_achilles * idf_achilles\n",
" tf_idf_odysseus = tf_odysseus * idf_odysseus\n",
" print(f\"\"\"In {filename}:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Achilles', 'Atreus'}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_list = ['Atreus', 'Achilles']\n",
"\n",
"set(my_list)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tokenized_texts' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m non_universal_terms \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m filename, values \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtokenized_texts\u001b[49m\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 4\u001b[0m my_set \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(values[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m other_file, other_values \u001b[38;5;129;01min\u001b[39;00m tokenized_texts\u001b[38;5;241m.\u001b[39mitems():\n",
"\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined"
]
}
],
"source": [
"non_universal_terms = {}\n",
"\n",
"for filename, values in tokenized_texts.items():\n",
" my_set = set(values['counts'].keys())\n",
"\n",
" for other_file, other_values in tokenized_texts.items():\n",
" if other_file != filename:\n",
" my_set -= set(other_values['counts'].keys())\n",
"\n",
" non_universal_terms[filename] = my_set\n",
"\n",
"non_universal_terms"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# F-IDF is really telling us about a given term frequency in each text and within the corpus as a whole.\n",
"\n",
"# TF-IDF might be useful in historical or legal documents where finding key terms would be helpful to known more about an event \n",
"# or clauses that are unique to a case. "
]
}
],
"metadata": {
Expand Down