906 changes: 890 additions & 16 deletions 10_apis.ipynb

Large diffs are not rendered by default.

4,381 changes: 4,369 additions & 12 deletions 11_nlp.ipynb

Large diffs are not rendered by default.

111 changes: 111 additions & 0 deletions PHlesson.ipynb
@@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b51c15e4",
"metadata": {},
"source": [
"Finding Distinctive Words with TF-IDF in Scientific Abstracts\n",
"\n",
"We'll apply TF-IDF to scientific abstracts to surface the most distinctive terms in each one. This is a handy tool for sorting through primary literature.\n",
"\n",
"TF-IDF weighs words based on two factors:\n",
"- Term Frequency (TF): how often a word appears in a document, normalized by the document's length (this captures a word's relative importance within the document)\n",
"- Inverse Document Frequency (IDF): how rare the word is across all documents. Words that appear in many documents (like \"the\" or \"study\") receive lower IDF scores; specialized terms receive higher scores.\n",
"\n",
"TF-IDF aims to highlight words that are frequent in a specific document and relatively unique across the corpus. TF-IDF helps with:\n",
"- Identifying field-specific terminology in academic literature\n",
"- Extracting keywords from documents\n",
"- Building search engines that return relevant results\n",
"- Comparing document similarity\n",
"- Text classification and clustering\n",
"\n",
"Code:\n",
"\n",
"```python\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from collections import Counter\n",
"from math import log\n",
"\n",
"# Download NLTK resources\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"\n",
"# Sample abstracts from different scientific fields\n",
"abstracts = [\n",
"    {\"field\": \"immunology\", \"text\": \"Study of antibodies and vaccines in immune response.\"},\n",
"    {\"field\": \"immunology\", \"text\": \"Research on vaccine effects on antibody production.\"},\n",
"    {\"field\": \"neuroscience\", \"text\": \"Brain activity during memory formation and recall.\"},\n",
"    {\"field\": \"neuroscience\", \"text\": \"Neural pathways and dopamine in the brain.\"}\n",
"]\n",
"\n",
"# Preprocess: lowercase, tokenize, and remove stopwords and very short tokens\n",
"processed_docs = []\n",
"for doc in abstracts:\n",
"    # Convert to lowercase and tokenize\n",
"    tokens = word_tokenize(doc[\"text\"].lower())\n",
"\n",
"    # Remove stopwords\n",
"    stop_words = set(stopwords.words('english'))\n",
"    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]\n",
"\n",
"    processed_docs.append({\"field\": doc[\"field\"], \"tokens\": tokens})\n",
"\n",
"# Calculate Term Frequency (TF)\n",
"# Count how many times each word appears in a document and divide by the total\n",
"# number of words. This normalization keeps longer documents from having\n",
"# artificially higher term frequencies.\n",
"for doc in processed_docs:\n",
"    word_counts = Counter(doc[\"tokens\"])\n",
"    total_words = len(doc[\"tokens\"])\n",
"    doc[\"tf\"] = {word: count/total_words for word, count in word_counts.items()}\n",
"\n",
"# Calculate Inverse Document Frequency (IDF)\n",
"# Count how many documents contain each word, then compute log(N/DF) where N is\n",
"# the total number of documents. Words that appear in many documents have high\n",
"# document frequency and therefore low IDF; the logarithm dampens the effect\n",
"# for very rare words.\n",
"all_words = set()\n",
"for doc in processed_docs:\n",
"    all_words.update(doc[\"tokens\"])\n",
"\n",
"total_docs = len(processed_docs)\n",
"idf = {}\n",
"for word in all_words:\n",
"    doc_count = sum(1 for doc in processed_docs if word in doc[\"tokens\"])\n",
"    idf[word] = log(total_docs / doc_count)\n",
"\n",
"# Calculate TF-IDF\n",
"# Multiply each term's TF by its IDF. Words with a high TF-IDF score appear\n",
"# frequently in a specific document but rarely in the overall corpus.\n",
"for doc in processed_docs:\n",
"    doc[\"tfidf\"] = {word: tf_value * idf[word] for word, tf_value in doc[\"tf\"].items()}\n",
"\n",
"# Find top terms by field\n",
"field_terms = {}\n",
"for doc in processed_docs:\n",
"    field = doc[\"field\"]\n",
"    if field not in field_terms:\n",
"        field_terms[field] = {}\n",
"\n",
"    for word, score in doc[\"tfidf\"].items():\n",
"        field_terms[field][word] = field_terms[field].get(word, 0) + score\n",
"\n",
"# Print top terms for each field\n",
"for field, terms in field_terms.items():\n",
"    top_terms = sorted(terms.items(), key=lambda x: x[1], reverse=True)[:3]\n",
"    print(f\"\\nTop terms for {field}:\")\n",
"    for term, score in top_terms:\n",
"        print(f\"  {term}: {score:.4f}\")\n",
"```\n",
"\n",
"We group documents by field and sum the TF-IDF scores for each word across documents in the same field to find field-specific terminology.\n",
"\n",
"Interpreting Results\n",
"\n",
"The output shows the most distinctive terms for each field. For immunology, you might see terms like \"antibodies\" and \"vaccines\" with high scores; for neuroscience, terms like \"brain\" and \"neural\" would score highly. These high-scoring terms represent the specialized vocabulary that characterizes each field, even though our sample is very small."
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
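The TF-IDF walkthrough in PHlesson.ipynb can be cross-checked without NLTK. Below is a minimal sketch of the same computation; the tokenizer and stopword list here are simplified stand-ins (an assumption for illustration, not NLTK's full set):

```python
from collections import Counter
from math import log

# Simplified stopword list (an assumption; the notebook uses NLTK's full set)
STOP_WORDS = {"of", "and", "in", "on", "the", "during"}

docs = [
    "Study of antibodies and vaccines in immune response.",
    "Research on vaccine effects on antibody production.",
    "Brain activity during memory formation and recall.",
    "Neural pathways and dopamine in the brain.",
]

def tokenize(text):
    # Lowercase, strip periods, drop stopwords and tokens of 2 chars or fewer
    words = text.lower().replace(".", "").split()
    return [w for w in words if w not in STOP_WORDS and len(w) > 2]

tokens = [tokenize(d) for d in docs]

# TF: raw count normalized by document length
tf = [{w: c / len(t) for w, c in Counter(t).items()} for t in tokens]

# IDF: log(N / number of documents containing the word)
vocab = {w for t in tokens for w in t}
idf = {w: log(len(docs) / sum(1 for t in tokens if w in t)) for w in vocab}

# TF-IDF: product of the two
tfidf = [{w: v * idf[w] for w, v in d.items()} for d in tf]

for i, scores in enumerate(tfidf):
    top = max(scores, key=scores.get)
    print(f"doc {i}: top term = {top!r} ({scores[top]:.3f})")
```

Because "brain" appears in both neuroscience abstracts, its IDF is log(4/2) ≈ 0.693, roughly half the log(4) ≈ 1.386 earned by terms unique to a single abstract, which is why document-unique terms dominate each ranking.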
19 changes: 19 additions & 0 deletions ex.ipynb
@@ -0,0 +1,19 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2f716d93",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Empty file added ex1.py
Empty file.
171 changes: 171 additions & 0 deletions pythonpractice.ipynb
@@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "10c41abc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hello world\n"
]
}
],
"source": [
"print(\"hello world\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12a2de18",
"metadata": {},
"outputs": [],
"source": [
"def greet_user(username: str):\n",
"    \"\"\"Return a simple greeting.\"\"\"\n",
"    return f\"Hello, {username}!\"  # f-string interpolation\n",
"\n",
"# Call with one argument; it returns a value instead of printing it, so the\n",
"# greeting can be reused inside other functions\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b2ce278",
"metadata": {},
"outputs": [],
"source": [
"def print_user_greeting(username: str):\n",
"    print(greet_user(username))  # prints the string returned by greet_user\n",
"\n",
"    # the inner call is evaluated first (inside out)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d8bc85c4",
"metadata": {},
"outputs": [],
"source": [
"def print_return(username: str):\n",
"    # print_user_greeting returns None, so this prints the greeting, then None\n",
"    print(print_user_greeting(username))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "97f5c079",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello, Catie!\n",
"None\n"
]
}
],
"source": [
"print_return(\"Catie\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "cec3bdfd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello, Catie!\n"
]
}
],
"source": [
"print_user_greeting(\"Catie\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e1c0b45a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Hello, Catie!'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"greet_user(\"Catie\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2628d25a",
"metadata": {},
"outputs": [],
"source": [
"def describe_pet(animal_type: str, pet_name: str):\n",
" return f\"My {animal_type} is {pet_name.title()}.\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "88e79391",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My dog is Lila.'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"describe_pet(\"dog\", \"Lila\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
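The `print_return` cells above illustrate a common gotcha: a function that only prints returns `None`, so printing its result prints `None`. A standalone sketch of the distinction (names mirror the notebook's, but this block is illustrative, not part of the PR):

```python
def greet_user(username: str) -> str:
    # Returns a value; nothing is printed here
    return f"Hello, {username}!"

def print_user_greeting(username: str) -> None:
    # Prints the greeting, then implicitly returns None
    print(greet_user(username))

greeting = greet_user("Catie")           # a reusable string value
result = print_user_greeting("Catie")    # prints "Hello, Catie!"
print(result)                            # prints "None"
```

This is why calling `print(print_user_greeting(...))` shows the greeting followed by `None`: the inner call prints and returns nothing, and the outer `print` displays that missing return value.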