def get_bow_from_docs(docs, stop_words=None):
    """Build a bag-of-words model from a list of plain-text documents.

    Each document is read as UTF-8, split on commas, periods, newlines and
    other whitespace, and lower-cased. Empty tokens are discarded.

    Parameters
    ----------
    docs : iterable of str
        Paths of the documents to read.
    stop_words : iterable of str, optional
        Words to leave out of the vocabulary. Any iterable works (list,
        set, frozenset, ...). ``None`` (the default) or an empty iterable
        means no word is excluded.

    Returns
    -------
    dict
        ``"bag_of_words"``: list of the unique, lower-cased words of the
        corpus, in order of first appearance (so the result is the same on
        every run).
        ``"term_freq"``: one list per document, giving the number of
        occurrences of each vocabulary word in that document, aligned with
        ``"bag_of_words"``.

    Raises
    ------
    OSError
        If one of the documents cannot be opened.
    """
    # A set gives O(1) membership tests and avoids a mutable default argument.
    excluded = set(stop_words) if stop_words is not None else set()

    corpus = []
    for path in docs:
        with open(path, "r", encoding="utf-8") as handle:
            text = handle.read()
        tokens = [tok.lower() for tok in re.split(r"[,\.\n\s]", text) if tok != ""]
        corpus.append(tokens)

    # A dict keeps insertion order, so the vocabulary is deterministic
    # (a plain set would reorder the words from one run to the next).
    vocabulary = {}
    for tokens in corpus:
        for tok in tokens:
            if tok not in excluded:
                vocabulary.setdefault(tok, None)
    bag_of_words = list(vocabulary)

    term_freq = []
    for tokens in corpus:
        # Count every token once instead of calling list.count per word.
        counts = {}
        for tok in tokens:
            counts[tok] = counts.get(tok, 0) + 1
        term_freq.append([counts.get(word, 0) for word in bag_of_words])

    return {
        "bag_of_words": bag_of_words,
        "term_freq": term_freq,
    }
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "frozenset({'however', 'perhaps', 'seems', 'whoever', 'hence', 'ours', 'also', 'almost', 'former', 'onto', 'call', 'wherever', 'enough', 'twenty', 'nothing', 'thick', 'less', 'first', 'more', 'where', 'etc', 'hers', 'move', 'herself', 'have', 'my', 'can', 'whence', 'below', 'thereby', 'what', 'others', 'therefore', 'she', 'of', 'everywhere', 'because', 'get', 'forty', 'after', 'around', 'if', 'myself', 'side', 'cannot', 'namely', 'along', 'i', 'besides', 'bottom', 'amoungst', 'than', 'us', 'become', 'co', 'had', 'being', 'himself', 'whether', 'noone', 'anyone', 'same', 'up', 'always', 'un', 'together', 'bill', 'beside', 'his', 'formerly', 'when', 'him', 'ever', 'within', 'from', 'further', 'nevertheless', 'whose', 'before', 'too', 'most', 'part', 'it', 'down', 'but', 'please', 'six', 'their', 'interest', 'wherein', 'was', 'per', 'sincere', 'beyond', 'by', 'name', 'indeed', 'many', 'been', 'we', 'here', 'ten', 'no', 'the', 'cry', 'these', 'see', 'eight', 'mostly', 'somewhere', 'anything', 'are', 'elsewhere', 'everyone', 'twelve', 'who', 'due', 'both', 'hereafter', 'several', 'nine', 'keep', 'them', 'may', 'none', 'would', 'against', 'sometimes', 'somehow', 'beforehand', 'whereafter', 'top', 'yet', 'upon', 'full', 'could', 'take', 'describe', 'even', 'amongst', 'that', 'own', 'still', 'as', 'above', 'whom', 'already', 'were', 'thereafter', 'yourself', 'find', 'he', 'nor', 'between', 'done', 'cant', 'empty', 'meanwhile', 'whither', 'while', 'fill', 'anywhere', 'with', 'yourselves', 'over', 'serious', 'thin', 'nowhere', 'there', 'until', 'thru', 'every', 'whatever', 'once', 'four', 'during', 'afterwards', 'some', 'became', 'via', 'whole', 'me', 'hereupon', 'nobody', 'so', 'throughout', 'someone', 'those', 'couldnt', 'whereas', 'your', 'go', 'any', 'two', 'at', 'back', 'either', 'has', 
'mill', 'seeming', 'least', 'all', 'one', 'our', 'behind', 'something', 'well', 'for', 'again', 'moreover', 'found', 'sixty', 'not', 'eleven', 'seem', 'an', 'you', 'or', 'about', 'amount', 'how', 'thus', 'latterly', 'across', 'through', 'they', 'very', 'inc', 'show', 'alone', 'to', 'this', 'must', 'among', 'will', 'themselves', 'hereby', 'ie', 'fire', 'anyway', 'sometime', 'fifty', 'de', 'front', 'should', 'only', 'then', 'which', 'detail', 'though', 'made', 'three', 'on', 'ltd', 'last', 'itself', 'mine', 'into', 'few', 'her', 'never', 'therein', 'another', 'becomes', 'although', 'under', 'its', 'otherwise', 'yours', 'five', 'am', 'seemed', 'and', 'system', 'other', 'out', 're', 'is', 'con', 'latter', 'often', 'much', 'everything', 'put', 'rather', 'a', 'becoming', 'fifteen', 'each', 'in', 'since', 'thereupon', 'now', 'except', 'else', 'give', 'without', 'whenever', 'why', 'towards', 'might', 'anyhow', 'hasnt', 'ourselves', 'whereby', 'herein', 'eg', 'next', 'do', 'thence', 'off', 'third', 'hundred', 'neither', 'toward', 'such', 'whereupon', 'be'})\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction import _stop_words\n", + "print(_stop_words.ENGLISH_STOP_WORDS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should have seen a large list of words that looks like:\n", + "\n", + "```frozenset({'across', 'mine', 'cannot', ...})```\n", + "\n", + "`frozenset` is a type of Python object that is immutable. In this lab you can use it just like an array without conversion." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, test your function with supplying `stop_words.ENGLISH_STOP_WORDS` as the second parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'bag_of_words': ['love', 'student', 'cool', 'ironhack'], 'term_freq': [[0, 0, 1, 1], [1, 0, 0, 1], [0, 1, 0, 1]]}\n" + ] + } + ], + "source": [ + "bow = get_bow_from_docs(docs, _stop_words.ENGLISH_STOP_WORDS)\n", + "\n", + "print(bow)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should have seen:\n", + "\n", + "```{'bag_of_words': ['ironhack', 'cool', 'love', 'student'], 'term_freq': [[1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]]}```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "___________________________________________________________________________________________________________________________" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab-functional-programming/your-code/.ipynb_checkpoints/Q2-checkpoint.ipynb b/lab-functional-programming/your-code/.ipynb_checkpoints/Q2-checkpoint.ipynb new file mode 100644 index 0000000..94e26e5 --- /dev/null +++ b/lab-functional-programming/your-code/.ipynb_checkpoints/Q2-checkpoint.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we want to enhance the `get_bow_from_docs` function so that it will work with HTML webpages. In HTML, there are a lot of messy codes such as HTML tags, Javascripts, [unicodes](https://www.w3schools.com/charsets/ref_utf_misc_symbols.asp) that will mess up your bag of words. 
def remove_spaces(text):
    """Split ``text`` into tokens on whitespace.

    A run of consecutive whitespace characters counts as one separator, and
    leading/trailing whitespace is ignored, so the result never contains
    empty strings.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The whitespace-delimited tokens, in order. Empty for blank input.
    """
    # re.split yields '' at the edges when the text starts or ends with
    # whitespace; those are filtered out here.
    return [token for token in re.split(r"\s+", text) if token]


def remove_punctuation(words_no_spaces):
    """Strip ``.``, ``,``, ``:`` and ``;`` from every token.

    Parameters
    ----------
    words_no_spaces : iterable of str
        Tokens, typically the output of ``remove_spaces``.

    Returns
    -------
    list of str
        The tokens without those four characters. Tokens that consisted
        only of such characters are dropped rather than kept as ``''``.
    """
    pattern = r"[\.,:;]"
    stripped = (re.sub(pattern, "", word) for word in words_no_spaces)
    return [word for word in stripped if word]


def to_lower_case(words_no_punctuation):
    """Return a new list with every token converted to lower case.

    Parameters
    ----------
    words_no_punctuation : iterable of str
        Tokens to normalise.

    Returns
    -------
    list of str
        Lower-cased tokens, in the same order.
    """
    return [word.lower() for word in words_no_punctuation]
def get_bow_from_docs(docs, stop_words=None):
    """Build a bag-of-words model from a list of documents.

    Each document is read as UTF-8 and passed through the notebook's
    cleaning helpers in this order: ``remove_spaces``,
    ``remove_punctuation``, ``to_lower_case``. The text is tokenized as-is;
    this function performs no markup removal of its own.

    Parameters
    ----------
    docs : iterable of str
        Paths of the documents to read.
    stop_words : iterable of str, optional
        Words to leave out of the vocabulary. Any iterable works (list,
        set, frozenset, ...). ``None`` (the default) or an empty iterable
        means no word is excluded.

    Returns
    -------
    dict
        ``"bag_of_words"``: list of the unique cleaned words of the corpus,
        in order of first appearance (so the result is the same on every
        run).
        ``"term_freq"``: one list per document, giving the number of
        occurrences of each vocabulary word in that document, aligned with
        ``"bag_of_words"``.

    Raises
    ------
    OSError
        If one of the documents cannot be opened.
    """
    # A set gives O(1) membership tests and avoids a mutable default argument.
    excluded = set(stop_words) if stop_words is not None else set()

    corpus = []
    for path in docs:
        with open(path, "r", encoding="utf-8") as handle:
            text = handle.read()
        words_no_spaces = remove_spaces(text)
        words_no_punctuation = remove_punctuation(words_no_spaces)
        words_lower_case = to_lower_case(words_no_punctuation)
        # The helpers may hand back empty strings (e.g. for consecutive
        # whitespace); an empty string is not a word, so it is dropped.
        corpus.append([word for word in words_lower_case if word])

    # A dict keeps insertion order, so the vocabulary is deterministic
    # (a plain set would reorder the words from one run to the next).
    vocabulary = {}
    for tokens in corpus:
        for word in tokens:
            if word not in excluded:
                vocabulary.setdefault(word, None)
    bag_of_words = list(vocabulary)

    term_freq = []
    for tokens in corpus:
        # Count every token once instead of calling list.count per word.
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        term_freq.append([counts.get(word, 0) for word in bag_of_words])

    return {
        "bag_of_words": bag_of_words,
        "term_freq": term_freq,
    }
', \"teacher's\", 'aliquam', 'visualizationit\\'s', '(60-70%)', '22)', 'class=\"next\">2007', 'columns', 'href=\"mailtoliz@coursereportcom?subject=flagged%3a%20ironhack%20%7c%20an%20incredible%20experience%21%20%7c%20id%3a%2016276\">flag', 'href=\"#cite_ref-footnoteadèr2008a344-345_30-0\">^', 'href=\"/w/indexphp?title=data_analysis&action=edit&section=36\"', 'data-request-id=\"16341\"', 'things', 'id=\"cite_ref-24\"', 'project', 'convince', 'alt=\"ironhack-student-typing\"', 'href=\"#cite_note-nehme_2016-09-29-40\">⎴]
want', 'clarify\">
where', 'title=\"analytics\">analytics', 'after
', 'end', 'data-file-height=\"566\"', 'href=\"/cities/barcelona\">barcelonareview', 'january', '
why', 'href=\"/wiki/riemann_solver\"', 'equally', 'reading', 'target=\"_blank\">', 'traction', 'src=\"//uploadwikimediaorg/wikipedia/commons/thumb/7/7e/us_phillips_curve_2000_to_2013png/250px-us_phillips_curve_2000_to_2013png\"', 'href=\"/wiki/opinion\"', 'everis
', 'href=\"#barriers_to_effective_analysis\">did', 'class=\"icon-calendar\">9/2/2015
ქართული
', 'added', 'id=\"toctogglecheckbox\"', 'id=\"mailing-category-input\">without', 'class=\"catlinks\"', 'inevitably', 'processing\">editkeepcoding', 'href=\"https//wwwcoursereportcom/schools/ironhack?rel=nofollow&shared_review=16276#reviews/review/16276\"', ' references
⎦]', 'href=\"http//itlipsumcom/\">italiano', 'text-center\">', 'agree', 'itemtype=\"http//schemaorg/aggregaterating\">avg', 'carolinas', 'href=\"http//wwwlinkedincom/in/ronaldricardo\">verified', '
how', 'attack', 'title=\"morse', 'frequency', 'vero', 'analysis-american', '/>