Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 93 additions & 26 deletions lab-functional-programming/your-code/Q1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,48 +19,82 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'corpus' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Input \u001b[1;32mIn [19]\u001b[0m, in \u001b[0;36m<cell line: 20>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 22\u001b[0m s\u001b[38;5;241m=\u001b[39mf\u001b[38;5;241m.\u001b[39mread()\n\u001b[0;32m 23\u001b[0m s\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(i \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m s \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m punctuation)\n\u001b[1;32m---> 24\u001b[0m \u001b[43mcorpus\u001b[49m\u001b[38;5;241m.\u001b[39mappend(s\u001b[38;5;241m.\u001b[39mlower())\n\u001b[0;32m 25\u001b[0m f\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m 28\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;124;03mLoop `corpus`. Append the terms in each doc into the `bag_of_words` array. The terms in `bag_of_words` \u001b[39;00m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;124;03mshould be unique which means before adding each term you need to check if it's already added to the array.\u001b[39;00m\n\u001b[0;32m 31\u001b[0m \u001b[38;5;124;03mIn addition, check if each term is in the `stop_words` array. Only append the term to `bag_of_words`\u001b[39;00m\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124;03mif it is not a stop word.\u001b[39;00m\n\u001b[0;32m 33\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n",
"\u001b[1;31mNameError\u001b[0m: name 'corpus' is not defined"
]
}
],
"source": [
"# Import required libraries\n",
"\n",
"\n",
"from string import punctuation\n",
"\n",
"\n",
"# Define function\n",
    "def get_bow_from_docs(docs, stop_words=[]):\n",
    "    # In the function, first define the variables you will use such as `corpus`, `bag_of_words`, and `term_freq`.\n",
    "    corpus = []\n",
    "    bag_of_words = []\n",
    "    term_freq = []\n",
    "\n",
    "    \"\"\"\n",
    "    Loop `docs` and read the content of each doc into a string in `corpus`.\n",
    "    Remember to convert the doc content to lowercases and remove punctuation.\n",
    "    \"\"\"\n",
    "    for doc in docs:\n",
    "        # `docs` comes from the caller; the context manager closes the file even on error.\n",
    "        with open(doc, 'r') as f:\n",
    "            s = f.read()\n",
    "        s = ''.join(i for i in s if i not in punctuation)\n",
    "        corpus.append(s.lower())\n",
    "\n",
    "    \"\"\"\n",
    "    Loop `corpus`. Append the terms in each doc into the `bag_of_words` array. The terms in `bag_of_words`\n",
    "    should be unique which means before adding each term you need to check if it's already added to the array.\n",
    "    In addition, check if each term is in the `stop_words` array. Only append the term to `bag_of_words`\n",
    "    if it is not a stop word.\n",
    "    \"\"\"\n",
    "    for s in corpus:\n",
    "        terms = s.split()\n",
    "        for term in terms:\n",
    "            if term not in bag_of_words and term not in stop_words:\n",
    "                bag_of_words.append(term)\n",
    "\n",
    "    \"\"\"\n",
    "    Loop `corpus` again. For each doc string, count the number of occurrences of each term in `bag_of_words`.\n",
    "    Create an array for each doc's term frequency and append it to `term_freq`.\n",
    "    \"\"\"\n",
    "    for s in corpus:\n",
    "        freq = []\n",
    "        terms = s.split()\n",
    "        for word in bag_of_words:\n",
    "            freq.append(terms.count(word))\n",
    "        term_freq.append(freq)\n",
    "\n",
    "    # Now return your output as an object\n",
    "    return {\n",
    "        \"bag_of_words\": bag_of_words,\n",
    "        \"term_freq\": term_freq\n",
    "    }\n",
" \n",
" "
]
},
Expand All @@ -75,15 +109,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'get_bow_from_docs' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Input \u001b[1;32mIn [15]\u001b[0m, in \u001b[0;36m<cell line: 5>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m docs \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdoc1.txt\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdoc2.txt\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdoc3.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Obtain BoW from your function\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m bow \u001b[38;5;241m=\u001b[39m \u001b[43mget_bow_from_docs\u001b[49m([ \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../lab-functional-programming/your-code/doc1.txt\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[0;32m 6\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../lab-functional-programming//your-code/doc2.txt\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[0;32m 7\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../lab-functional-programming//your-code/doc3.txt\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# Print BoW\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(bow)\n",
"\u001b[1;31mNameError\u001b[0m: name 'get_bow_from_docs' is not defined"
]
}
],
"source": [
"# Define doc paths array\n",
"docs = []\n",
"docs = ['doc1.txt', 'doc2.txt', 'doc3.txt']\n",
"\n",
"# Obtain BoW from your function\n",
"bow = get_bow_from_docs(docs)\n",
"bow = get_bow_from_docs([ '../../lab-functional-programming/your-code/doc1.txt', \n",
" '../../lab-functional-programming//your-code/doc2.txt', \n",
" '../../lab-functional-programming//your-code/doc3.txt'])\n",
"\n",
"# Print BoW\n",
"print(bow)"
Expand All @@ -100,12 +148,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"frozenset({'were', 'he', 'been', 'then', 'except', 'de', 'couldnt', 'again', 'where', 'against', 'interest', 'somewhere', 'whom', 'bottom', 'therefore', 'whereas', 'therein', 'done', 'no', 'my', 'put', 'via', 'else', 'on', 'each', 'neither', 'behind', 'over', 'whose', 'mostly', 'who', 'off', 'yourselves', 'before', 'sometime', 'co', 'describe', 'wherever', 'everything', 'full', 'namely', 'might', 'afterwards', 'whoever', 'anyone', 'fire', 'also', 'others', 'whole', 'their', 'beyond', 'ten', 'about', 'twenty', 'himself', 'hence', 'fill', 'sometimes', 'am', 'all', 'keep', 'not', 'nevertheless', 'up', 'rather', 'how', 'ours', 'formerly', 'both', 'further', 'though', 'herself', 'sincere', 'latterly', 'him', 'than', 'within', 'would', 'someone', 'fifty', 'five', 'show', 'of', 'after', 'thence', 'amoungst', 'hereby', 'these', 'as', 'will', 'nine', 'call', 'already', 'least', 'yourself', 'during', 'ever', 'nor', 'should', 'may', 'seem', 'serious', 'amount', 'that', 'could', 'i', 'in', 'few', 'anyway', 'becoming', 'noone', 'either', 'very', 'anywhere', 'much', 'its', 'go', 'whenever', 'from', 'here', 'mine', 'more', 'but', 'alone', 'us', 'into', 'together', 'now', 'our', 'found', 'meanwhile', 'toward', 'beforehand', 'an', 'has', 'what', 'never', 'had', 'while', 'because', 'a', 'hereafter', 'must', 'three', 'several', 'somehow', 'forty', 'those', 'everyone', 'above', 'anyhow', 'for', 'fifteen', 'bill', 'thick', 'at', 'can', 'do', 'still', 'latter', 'if', 'part', 'since', 'inc', 'perhaps', 'none', 'eg', 'such', 'there', 'became', 'eight', 'cry', 'was', 'top', 'yours', 'four', 'made', 'once', 'eleven', 'thus', 'by', 'name', 'per', 'almost', 'something', 'myself', 'without', 'thereupon', 'enough', 'beside', 'thru', 'being', 'itself', 'his', 'same', 'find', 'themselves', 'it', 'often', 'cannot', 'side', 'however', 'elsewhere', 'seems', 'give', 'they', 'next', 'you', 'even', 'see', 'thereby', 'please', 'indeed', 'amongst', 'get', 'whereby', 'otherwise', 'thereafter', 'to', 
'back', 'cant', 'have', 'ltd', 'hasnt', 'the', 'through', 'nobody', 'hers', 'thin', 'detail', 'moreover', 'out', 'etc', 'move', 'why', 'be', 'twelve', 'down', 'one', 'other', 'third', 'me', 'every', 'although', 'everywhere', 'throughout', 'ourselves', 'ie', 'anything', 'whatever', 'which', 'seeming', 'become', 'most', 'hundred', 'whereafter', 'them', 'so', 'whither', 'last', 'whether', 'is', 'upon', 'or', 'seemed', 'her', 'towards', 'always', 'are', 'first', 'any', 'wherein', 'well', 'across', 'too', 'yet', 'nothing', 'herein', 're', 'con', 'we', 'around', 'many', 'whereupon', 'mill', 'another', 'becomes', 'empty', 'some', 'six', 'un', 'take', 'besides', 'sixty', 'until', 'nowhere', 'along', 'among', 'less', 'front', 'former', 'under', 'with', 'when', 'between', 'onto', 'only', 'this', 'whence', 'and', 'your', 'due', 'two', 'system', 'hereupon', 'own', 'below', 'she'})\n"
]
}
],
"source": [
"from sklearn.feature_extraction import stop_words\n",
"print(stop_words.ENGLISH_STOP_WORDS)"
"from sklearn.feature_extraction import _stop_words\n",
"print(_stop_words.ENGLISH_STOP_WORDS)"
]
},
{
Expand All @@ -128,11 +184,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (2582812775.py, line 3)",
"output_type": "error",
"traceback": [
"\u001b[1;36m Input \u001b[1;32mIn [17]\u001b[1;36m\u001b[0m\n\u001b[1;33m '../../lab-functional-programming//your-code/doc3.txt'] _stop_words.ENGLISH_STOP_WORDS)\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"bow = get_bow_from_docs(bow, stop_words.ENGLISH_STOP_WORDS)\n",
"bow = get_bow_from_docs([ '../../lab-functional-programming/your-code/doc1.txt', \n",
" '../../lab-functional-programming//your-code/doc2.txt', \n",
" '../../lab-functional-programming//your-code/doc3.txt'], _stop_words.ENGLISH_STOP_WORDS)\n",
"\n",
"print(bow)"
]
Expand All @@ -156,7 +223,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -170,9 +237,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
71 changes: 61 additions & 10 deletions lab-functional-programming/your-code/Q2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,27 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Define your string handling functions below\n",
"# Minimal 3 functions\n"
"# Minimal 3 functions\n",
"import re\n",
"from string import punctuation\n",
"\n",
"def strip_html_tags(html):\n",
" s = re.sub(\"<!--.*?-->\", \"\", html, flags=re.DOTALL)\n",
" s = re.sub(\"<.*?>\", \"\", s, flags=re.DOTALL)\n",
" return(s)\n",
"\n",
"\n",
"def remove_punctuation(html):\n",
" s = ''.join(i for i in html if i not in punctuation)\n",
" return(s)\n",
"\n",
"def to_lower_case(html):\n",
" return(html.lower())\n"
]
},
{
Expand All @@ -32,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -42,6 +57,28 @@
" bag_of_words = []\n",
" term_freq = []\n",
" \n",
" for doc in docs:\n",
" f = open(doc, 'r', encoding='utf-8')\n",
" s = f.read()\n",
" s = strip_html_tags(s)\n",
" s = remove_punctuation(s)\n",
" s = to_lower_case(s)\n",
" corpus.append(s)\n",
" f.close()\n",
" \n",
" for s in corpus:\n",
" s = ''.join(i for i in s if i not in punctuation)\n",
" terms = s.split()\n",
" for term in terms:\n",
" if not term in bag_of_words and not term in stop_words:\n",
" bag_of_words.append(term)\n",
" \n",
" for s in corpus:\n",
" freq = []\n",
" terms = s.split()\n",
" for word in bag_of_words:\n",
" freq.append(terms.count(word))\n",
" term_freq.append(freq)\n",
" # write your codes here\n",
" \n",
" return {\n",
Expand All @@ -60,17 +97,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x9d in position 167699: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"Input \u001b[1;32mIn [15]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _stop_words\n\u001b[1;32m----> 2\u001b[0m bow \u001b[38;5;241m=\u001b[39m \u001b[43mget_bow_from_docs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwww.coursereport.com_ironhack.html\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43men.wikipedia.org_Data_analysis.html\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwww.lipsum.com.html\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\n\u001b[0;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43m_stop_words\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mENGLISH_STOP_WORDS\u001b[49m\n\u001b[0;32m 8\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(bow)\n",
"Input \u001b[1;32mIn [11]\u001b[0m, in \u001b[0;36mget_bow_from_docs\u001b[1;34m(docs, stop_words)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m docs:\n\u001b[0;32m 8\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(doc, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m----> 9\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 10\u001b[0m s \u001b[38;5;241m=\u001b[39m strip_html_tags(s)\n\u001b[0;32m 11\u001b[0m s \u001b[38;5;241m=\u001b[39m remove_punctuation(s)\n",
"File \u001b[1;32m~\\anaconda4\\lib\\encodings\\cp1252.py:23\u001b[0m, in \u001b[0;36mIncrementalDecoder.decode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m---> 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcodecs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcharmap_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43mdecoding_table\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x9d in position 167699: character maps to <undefined>"
]
}
],
"source": [
"from sklearn.feature_extraction import stop_words\n",
"from sklearn.feature_extraction import _stop_words\n",
"bow = get_bow_from_docs([\n",
" 'www.coursereport.com_ironhack.html',\n",
" 'en.wikipedia.org_Data_analysis.html',\n",
" 'www.lipsum.com.html'\n",
" ],\n",
" stop_words.ENGLISH_STOP_WORDS\n",
" _stop_words.ENGLISH_STOP_WORDS\n",
")\n",
"\n",
"print(bow)"
Expand All @@ -97,7 +148,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -111,9 +162,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
Loading