ta-data-mad · VickyViana · Apr 9, 2021 · Apr 10, 2021 · Apr 10, 2021 · Apr 10, 2021
diff --git a/.gitignore b/.gitignore
@@ -132,4 +132,6 @@ dmypy.json
 .DS_Store
 
 # Miscellaneous
-.idea
+.idea
+
+tweets.csv
diff --git a/module-3/natural-language-processing/.gitignore b/module-3/natural-language-processing/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# macOS
+.DS_Store
+
+# PyCharm
+.idea
+
+__trash__
+tweets.csv
diff --git a/module-3/natural-language-processing/your-code/challenge-1.ipynb b/module-3/natural-language-processing/your-code/challenge-1.ipynb
@@ -66,10 +66,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 55,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import re\n",
+    "\n",
+    "\n",
     "def clean_up(s):\n",
     "    \"\"\"\n",
     "    Cleans up numbers, URLs, and special characters from a string.\n",
@@ -79,7 +82,39 @@
     "\n",
     "    Returns:\n",
     "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    no_url = re.sub(r'http\\S+',' ', s)\n",
+    "    no_scha= re.sub(r'\\W+', ' ', no_url)\n",
+    "    no_num = re.sub(r'\\d+', ' ', no_scha)\n",
+    "    no_cap = no_num.lower()\n",
+    "    return no_cap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " ironhack s q website   is   \n"
+     ]
+    }
+   ],
+   "source": [
+    "text_clean = clean_up(text)\n",
+    "print(text_clean)"
    ]
   },
   {
@@ -101,7 +136,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,7 +158,27 @@
     "\n",
     "    Returns:\n",
     "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    word_list = nltk.tokenize.word_tokenize(s)\n",
+    "    return word_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 's', 'q', 'website', 'is']\n"
+     ]
+    }
+   ],
+   "source": [
+    "word_list = tokenize(text_clean)\n",
+    "print(word_list)"
    ]
   },
   {
@@ -145,7 +209,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.stem import PorterStemmer\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "ps = PorterStemmer()\n",
+    "lemmatizer = WordNetLemmatizer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -158,7 +234,33 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    stemming =[]\n",
+    "    for x in l:\n",
+    "        stemming.append(ps.stem(x))\n",
+    "    \n",
+    "    lemmatizing =[]\n",
+    "    for y in stemming:\n",
+    "        lemmatizing.append(lemmatizer.lemmatize(y))\n",
+    "    return lemmatizing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 's', 'q', 'websit', 'is']\n"
+     ]
+    }
+   ],
+   "source": [
+    "clean_word_list = stem_and_lemmatize(word_list)\n",
+    "print(clean_word_list)"
    ]
   },
   {
@@ -176,7 +278,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.corpus import stopwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stop_words = set(stopwords.words('english')) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,7 +309,32 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    list_sw = []\n",
+    "    for z in l:\n",
+    "        if z not in stop_words:\n",
+    "            list_sw.append(z)\n",
+    "    return list_sw"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 'q', 'websit']"
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "remove_stopwords(clean_word_list)"
    ]
   },
   {
@@ -204,9 +349,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [conda env:ironhack_env]",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-ironhack_env-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -218,7 +363,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,