From 7759a170dda00070ea936a00c3c7fcca2f2ec868 Mon Sep 17 00:00:00 2001 From: miagia27 Date: Wed, 26 Mar 2025 14:08:33 +0000 Subject: [PATCH 1/2] tf-idf.ipynb --- tf-idf.ipynb | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tf-idf.ipynb b/tf-idf.ipynb index 6f041cc..f6492c1 100644 --- a/tf-idf.ipynb +++ b/tf-idf.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -12,7 +12,7 @@ "Collecting lxml\n", " Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n", "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m41.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: lxml\n", "Successfully installed lxml-5.3.1\n", "\n", @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -62,18 +62,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", - "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n", "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n", + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n", "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", - 
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n" + "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", + "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n" ] } ], @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -125,7 +125,7 @@ "4" ] }, - "execution_count": 71, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } From a7a275a1c3d3ecf1d395853999f2d232ee477583 Mon Sep 17 00:00:00 2001 From: miagia27 Date: Thu, 27 Mar 2025 02:51:07 +0000 Subject: [PATCH 2/2] ted-idf.ipynb --- tf-idf.ipynb | 225 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 211 insertions(+), 14 deletions(-) diff --git a/tf-idf.ipynb b/tf-idf.ipynb index f6492c1..97ed9c6 100644 --- a/tf-idf.ipynb +++ b/tf-idf.ipynb @@ -2,19 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Collecting lxml\n", - " Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n", - "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m41.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: lxml\n", - "Successfully installed lxml-5.3.1\n", + "Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", 
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", @@ -28,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -96,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -116,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -125,7 +120,7 @@ "4" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -141,6 +136,208 @@ "\n", "df_ulysses" ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "IndentationError", + "evalue": "expected an indented block after 'with' statement on line 7 (3658932252.py, line 8)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[13], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m text = f.read().lower()\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block after 'with' statement on line 7\n" + ] + } + ], + "source": [ + "from TEI_NS import word_tokenize\n", + "tokenized_texts = {}\n", + "text_files = Path(\".\").glob(\"tlg0012.tlg00*.perseus-eng[1-4].txt\")\n", + "for file in text_files:\n", + " name = str(file)\n", + "\n", + " with open(file) as f:\n", + " text = f.read().lower()\n", + " tokens = word_tokenize(text)\n", + " 
print(f\"There are {len(tokens)} tokens in {name}.\")\n", + " tokenized_texts[name] = tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenized_texts' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Counter\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m filename, tokens \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtokenized_texts\u001b[49m\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 3\u001b[0m counts \u001b[38;5;241m=\u001b[39m Counter(tokens)\n\u001b[1;32m 5\u001b[0m tokenized_texts[filename] \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokens\u001b[39m\u001b[38;5;124m\"\u001b[39m: tokens, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m\"\u001b[39m: counts}\n", + "\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined" + ] + } + ], + "source": [ + "from collections import Counter\n", + "for filename, tokens in tokenized_texts.items():\n", + " counts = Counter(tokens)\n", + "\n", + " tokenized_texts[filename] = {\"tokens\": tokens, \"counts\": counts}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenized_texts' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43mtokenized_texts\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtlg0012.tlg001.perseus-eng3.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124modysseus\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined" + ] + } + ], + "source": [ + "tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenized_texts' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m df_achilles \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 2\u001b[0m df_odysseus \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m filename, values \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtokenized_texts\u001b[49m\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124modysseus\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m values[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 6\u001b[0m df_odysseus \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined" + ] + } + ], + "source": [ + "df_achilles = 0\n", + "df_odysseus = 0\n", + "\n", + "for filename, values in tokenized_texts.items():\n", + " if \"odysseus\" in values['counts']:\n", + " 
df_odysseus += 1\n", + " \n", + " if \"achilles\" in values[\"counts\"]:\n", + " df_achilles += 1\n", + "\n", + "from math import log10\n", + "\n", + "n_docs = len(tokenized_texts.keys())\n", + "\n", + "idf_achilles = log10(n_docs / df_achilles)\n", + "idf_odysseus = log10(n_docs / df_odysseus)\n", + "\n", + "print(idf_achilles)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "unterminated triple-quoted f-string literal (detected at line 7) (695929936.py, line 7)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[5], line 7\u001b[0;36m\u001b[0m\n\u001b[0;31m print(f\"\"\"In {filename}:\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m unterminated triple-quoted f-string literal (detected at line 7)\n" + ] + } + ], + "source": [ + "for filename, values in tokenized_texts.items():\n", + " total_terms = len(values['tokens'])\n", + " tf_achilles = values['counts']['achilles'] / total_terms\n", + " tf_odysseus = values['counts']['odysseus'] / total_terms\n", + " tf_idf_achilles = tf_achilles * idf_achilles\n", + " tf_idf_odysseus = tf_odysseus * idf_odysseus\n", + " print(f\"\"\"In {filename}:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Achilles', 'Atreus'}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_list = ['Atreus', 'Achilles']\n", + "\n", + "set(my_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenized_texts' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell 
\u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m non_universal_terms \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m filename, values \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtokenized_texts\u001b[49m\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 4\u001b[0m my_set \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(values[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcounts\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m other_file, other_values \u001b[38;5;129;01min\u001b[39;00m tokenized_texts\u001b[38;5;241m.\u001b[39mitems():\n", + "\u001b[0;31mNameError\u001b[0m: name 'tokenized_texts' is not defined" + ] + } + ], + "source": [ + "non_universal_terms = {}\n", + "\n", + "for filename, values in tokenized_texts.items():\n", + " my_set = set(values['counts'].keys())\n", + "\n", + " for other_file, other_values in tokenized_texts.items():\n", + " if other_file != filename:\n", + " my_set -= set(other_values['counts'].keys())\n", + "\n", + " non_universal_terms[filename] = my_set\n", + "\n", + "non_universal_terms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TF-IDF is really telling us about a given term's frequency in each text and within the corpus as a whole.\n", + "\n", + "# TF-IDF might be useful in historical or legal documents where finding key terms would be helpful to know more about an event \n", + "# or clauses that are unique to a case. " + ] } ], "metadata": {