Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,6 @@ dmypy.json
.DS_Store

# Miscellaneous
.idea
.idea

tweets.csv
138 changes: 138 additions & 0 deletions module-3/natural-language-processing/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a Python script from a template
# before PyInstaller builds the exe, so as to inject the date and other information into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, when collaborating across platforms, dependencies that are platform-specific or
# lack cross-platform support can cause pipenv to install packages that don't work, or to
# skip dependencies that are needed.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# macOS
.DS_Store

# PyCharm
.idea

__trash__
tweets.csv
167 changes: 156 additions & 11 deletions module-3/natural-language-processing/your-code/challenge-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def clean_up(s):\n",
" \"\"\"\n",
" Cleans up numbers, URLs, and special characters from a string.\n",
Expand All @@ -79,7 +82,39 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
" \"\"\"\n",
" no_url = re.sub(r'http\\S+',' ', s)\n",
" no_scha= re.sub(r'\\W+', ' ', no_url)\n",
" no_num = re.sub(r'\\d+', ' ', no_scha)\n",
" no_cap = no_num.lower()\n",
" return no_cap"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"text = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\""
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ironhack s q website is \n"
]
}
],
"source": [
"text_clean = clean_up(text)\n",
"print(text_clean)"
]
},
{
Expand All @@ -101,7 +136,16 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -114,7 +158,27 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
" \"\"\"\n",
" word_list = nltk.tokenize.word_tokenize(s)\n",
" return word_list"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 's', 'q', 'website', 'is']\n"
]
}
],
"source": [
"word_list = tokenize(text_clean)\n",
"print(word_list)"
]
},
{
Expand Down Expand Up @@ -145,7 +209,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer\n",
"from nltk.stem import WordNetLemmatizer\n",
"ps = PorterStemmer()\n",
"lemmatizer = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -158,7 +234,33 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
" \"\"\"\n",
" stemming =[]\n",
" for x in l:\n",
" stemming.append(ps.stem(x))\n",
" \n",
" lemmatizing =[]\n",
" for y in stemming:\n",
" lemmatizing.append(lemmatizer.lemmatize(y))\n",
" return lemmatizing"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 's', 'q', 'websit', 'is']\n"
]
}
],
"source": [
"clean_word_list = stem_and_lemmatize(word_list)\n",
"print(clean_word_list)"
]
},
{
Expand All @@ -176,7 +278,25 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"stop_words = set(stopwords.words('english')) "
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -189,7 +309,32 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
" \"\"\"\n",
" list_sw = []\n",
" for z in l:\n",
" if z not in stop_words:\n",
" list_sw.append(z)\n",
" return list_sw"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ironhack', 'q', 'websit']"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"remove_stopwords(clean_word_list)"
]
},
{
Expand All @@ -204,9 +349,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [conda env:ironhack_env]",
"language": "python",
"name": "python3"
"name": "conda-env-ironhack_env-py"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -218,7 +363,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.8.5"
}
},
"nbformat": 4,
Expand Down
Loading