diff --git a/your-code/main.ipynb b/your-code/main.ipynb
index 31724c5..55465b9 100644
--- a/your-code/main.ipynb
+++ b/your-code/main.ipynb
@@ -9,10 +9,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "import pandas as pd"
+ ]
},
{
"cell_type": "markdown",
@@ -23,10 +25,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "import pymysql\n",
+ "import sqlalchemy"
+ ]
},
{
"cell_type": "markdown",
@@ -37,10 +42,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# Connect to the schema that every query in this notebook reads from (stats), not an unrelated default database.\nengine = sqlalchemy.create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz:3306/stats')"
+ ]
},
{
"cell_type": "markdown",
@@ -51,10 +58,32 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# Pull every column of stats.users into a DataFrame.\nsqlSentence = 'SELECT * FROM stats.users'\n",
+ "users_table = pd.read_sql_query(sql=sqlSentence, con=engine)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " Id Reputation CreationDate DisplayName LastAccessDate \\\n0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n\n WebsiteUrl Location \\\n0 http://meta.stackexchange.com/ on the server farm \n1 http://stackoverflow.com Corvallis, OR \n2 http://stackoverflow.com New York, NY \n3 http://minesweeperonline.com San Francisco, CA \n4 http://www.statalgo.com New York, NY \n\n AboutMe Views UpVotes \\\n0
Hi, I'm not really a person.
\\n\\nI'm ... 0 5007 \n1
Developer on the StackOverflow team. Find ... 25 3 \n2
currently at a startup in SF
\\n\\nform... 11 0 \n4
Quantitative researcher focusing on statist... 1145 662 \n\n DownVotes AccountId Age ProfileImageUrl \n0 1920 -1 NaN None \n1 0 2 37.0 None \n2 0 3 35.0 None \n3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n4 5 54503 35.0 None ",
+ "text/html": "
\n\n
\n \n \n | \n Id | \n Reputation | \n CreationDate | \n DisplayName | \n LastAccessDate | \n WebsiteUrl | \n Location | \n AboutMe | \n Views | \n UpVotes | \n DownVotes | \n AccountId | \n Age | \n ProfileImageUrl | \n
\n \n \n \n | 0 | \n -1 | \n 1 | \n 2010-07-19 06:55:26 | \n Community | \n 2010-07-19 06:55:26 | \n http://meta.stackexchange.com/ | \n on the server farm | \n <p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ... | \n 0 | \n 5007 | \n 1920 | \n -1 | \n NaN | \n None | \n
\n \n | 1 | \n 2 | \n 101 | \n 2010-07-19 14:01:36 | \n Geoff Dalgas | \n 2013-11-12 22:07:23 | \n http://stackoverflow.com | \n Corvallis, OR | \n <p>Developer on the StackOverflow team. Find ... | \n 25 | \n 3 | \n 0 | \n 2 | \n 37.0 | \n None | \n
\n \n | 2 | \n 3 | \n 101 | \n 2010-07-19 15:34:50 | \n Jarrod Dixon | \n 2014-08-08 06:42:58 | \n http://stackoverflow.com | \n New York, NY | \n <p><a href=\"http://blog.stackoverflow.com/2009... | \n 22 | \n 19 | \n 0 | \n 3 | \n 35.0 | \n None | \n
\n \n | 3 | \n 4 | \n 101 | \n 2010-07-19 19:03:27 | \n Emmett | \n 2014-01-02 09:31:02 | \n http://minesweeperonline.com | \n San Francisco, CA | \n <p>currently at a startup in SF</p>\\n\\n<p>form... | \n 11 | \n 0 | \n 0 | \n 1998 | \n 28.0 | \n http://i.stack.imgur.com/d1oHX.jpg | \n
\n \n | 4 | \n 5 | \n 6792 | \n 2010-07-19 19:03:57 | \n Shane | \n 2014-08-13 00:23:47 | \n http://www.statalgo.com | \n New York, NY | \n <p>Quantitative researcher focusing on statist... | \n 1145 | \n 662 | \n 5 | \n 54503 | \n 35.0 | \n None | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ],
+ "source": [
+ "users_table.head(n=5)"
+ ]
},
{
"cell_type": "markdown",
@@ -65,10 +94,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "users_table = users_table.rename(columns={'Id': 'userId'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " userId Reputation CreationDate DisplayName LastAccessDate \\\n0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n\n WebsiteUrl Location \\\n0 http://meta.stackexchange.com/ on the server farm \n1 http://stackoverflow.com Corvallis, OR \n2 http://stackoverflow.com New York, NY \n3 http://minesweeperonline.com San Francisco, CA \n4 http://www.statalgo.com New York, NY \n\n AboutMe Views UpVotes \\\n0 Hi, I'm not really a person.
\\n\\nI'm ... 0 5007 \n1
Developer on the StackOverflow team. Find ... 25 3 \n2
currently at a startup in SF
\\n\\nform... 11 0 \n4
Quantitative researcher focusing on statist... 1145 662 \n\n DownVotes AccountId Age ProfileImageUrl \n0 1920 -1 NaN None \n1 0 2 37.0 None \n2 0 3 35.0 None \n3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n4 5 54503 35.0 None ",
+ "text/html": "
\n\n
\n \n \n | \n userId | \n Reputation | \n CreationDate | \n DisplayName | \n LastAccessDate | \n WebsiteUrl | \n Location | \n AboutMe | \n Views | \n UpVotes | \n DownVotes | \n AccountId | \n Age | \n ProfileImageUrl | \n
\n \n \n \n | 0 | \n -1 | \n 1 | \n 2010-07-19 06:55:26 | \n Community | \n 2010-07-19 06:55:26 | \n http://meta.stackexchange.com/ | \n on the server farm | \n <p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ... | \n 0 | \n 5007 | \n 1920 | \n -1 | \n NaN | \n None | \n
\n \n | 1 | \n 2 | \n 101 | \n 2010-07-19 14:01:36 | \n Geoff Dalgas | \n 2013-11-12 22:07:23 | \n http://stackoverflow.com | \n Corvallis, OR | \n <p>Developer on the StackOverflow team. Find ... | \n 25 | \n 3 | \n 0 | \n 2 | \n 37.0 | \n None | \n
\n \n | 2 | \n 3 | \n 101 | \n 2010-07-19 15:34:50 | \n Jarrod Dixon | \n 2014-08-08 06:42:58 | \n http://stackoverflow.com | \n New York, NY | \n <p><a href=\"http://blog.stackoverflow.com/2009... | \n 22 | \n 19 | \n 0 | \n 3 | \n 35.0 | \n None | \n
\n \n | 3 | \n 4 | \n 101 | \n 2010-07-19 19:03:27 | \n Emmett | \n 2014-01-02 09:31:02 | \n http://minesweeperonline.com | \n San Francisco, CA | \n <p>currently at a startup in SF</p>\\n\\n<p>form... | \n 11 | \n 0 | \n 0 | \n 1998 | \n 28.0 | \n http://i.stack.imgur.com/d1oHX.jpg | \n
\n \n | 4 | \n 5 | \n 6792 | \n 2010-07-19 19:03:57 | \n Shane | \n 2014-08-13 00:23:47 | \n http://www.statalgo.com | \n New York, NY | \n <p>Quantitative researcher focusing on statist... | \n 1145 | \n 662 | \n 5 | \n 54503 | \n 35.0 | \n None | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ],
+ "source": [
+ "users_table.head(n=5)"
+ ]
},
{
"cell_type": "markdown",
@@ -79,10 +129,32 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# Pull every column of stats.posts into a DataFrame.\nsqlSentence = 'SELECT * FROM stats.posts'\n",
+ "posts_table = pd.read_sql_query(sql=sqlSentence, con=engine)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " Id PostTypeId AcceptedAnswerId CreaionDate Score ViewCount \\\n0 1 1 15.0 2010-07-19 19:12:12 23 1278.0 \n1 2 1 59.0 2010-07-19 19:12:57 22 8198.0 \n2 3 1 5.0 2010-07-19 19:13:28 54 3613.0 \n3 4 1 135.0 2010-07-19 19:13:31 13 5224.0 \n4 5 2 NaN 2010-07-19 19:14:43 81 NaN \n\n Body OwnerUserId \\\n0 How should I elicit prior distributions fro... 8.0 \n1
In many different statistical methods there... 24.0 \n2
What are some valuable Statistical Analysis... 18.0 \n3
I have two groups of data. Each with a dif... 23.0 \n4
The R-project
\\n\\n\n\n
\n \n \n | \n Id | \n PostTypeId | \n AcceptedAnswerId | \n CreaionDate | \n Score | \n ViewCount | \n Body | \n OwnerUserId | \n LasActivityDate | \n Title | \n ... | \n AnswerCount | \n CommentCount | \n FavoriteCount | \n LastEditorUserId | \n LastEditDate | \n CommunityOwnedDate | \n ParentId | \n ClosedDate | \n OwnerDisplayName | \n LastEditorDisplayName | \n
\n \n \n \n | 0 | \n 1 | \n 1 | \n 15.0 | \n 2010-07-19 19:12:12 | \n 23 | \n 1278.0 | \n <p>How should I elicit prior distributions fro... | \n 8.0 | \n 2010-09-15 21:08:26 | \n Eliciting priors from experts | \n ... | \n 5.0 | \n 1 | \n 14.0 | \n NaN | \n NaT | \n NaT | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 1 | \n 2 | \n 1 | \n 59.0 | \n 2010-07-19 19:12:57 | \n 22 | \n 8198.0 | \n <p>In many different statistical methods there... | \n 24.0 | \n 2012-11-12 09:21:54 | \n What is normality? | \n ... | \n 7.0 | \n 1 | \n 8.0 | \n 88.0 | \n 2010-08-07 17:56:44 | \n NaT | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 2 | \n 3 | \n 1 | \n 5.0 | \n 2010-07-19 19:13:28 | \n 54 | \n 3613.0 | \n <p>What are some valuable Statistical Analysis... | \n 18.0 | \n 2013-05-27 14:48:36 | \n What are some valuable Statistical Analysis op... | \n ... | \n 19.0 | \n 4 | \n 36.0 | \n 183.0 | \n 2011-02-12 05:50:03 | \n 2010-07-19 19:13:28 | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 3 | \n 4 | \n 1 | \n 135.0 | \n 2010-07-19 19:13:31 | \n 13 | \n 5224.0 | \n <p>I have two groups of data. Each with a dif... | \n 23.0 | \n 2010-09-08 03:00:19 | \n Assessing the significance of differences in d... | \n ... | \n 5.0 | \n 2 | \n 2.0 | \n NaN | \n NaT | \n NaT | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 4 | \n 5 | \n 2 | \n NaN | \n 2010-07-19 19:14:43 | \n 81 | \n NaN | \n <p>The R-project</p>\\n\\n<p><a href=\"http://www... | \n 23.0 | \n 2010-07-19 19:21:15 | \n None | \n ... | \n NaN | \n 3 | \n NaN | \n 23.0 | \n 2010-07-19 19:21:15 | \n 2010-07-19 19:14:43 | \n 3.0 | \n NaT | \n None | \n None | \n
\n \n
\n5 rows × 21 columns
\n"
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ],
+ "source": [
+ "posts_table.head(n=5)"
+ ]
},
{
"cell_type": "markdown",
@@ -93,10 +165,31 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "posts_table = posts_table.rename(columns={'Id': 'postId', 'OwnerUserId': 'userId'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " postId PostTypeId AcceptedAnswerId CreaionDate Score ViewCount \\\n0 1 1 15.0 2010-07-19 19:12:12 23 1278.0 \n1 2 1 59.0 2010-07-19 19:12:57 22 8198.0 \n2 3 1 5.0 2010-07-19 19:13:28 54 3613.0 \n3 4 1 135.0 2010-07-19 19:13:31 13 5224.0 \n4 5 2 NaN 2010-07-19 19:14:43 81 NaN \n\n Body userId \\\n0 How should I elicit prior distributions fro... 8.0 \n1
In many different statistical methods there... 24.0 \n2
What are some valuable Statistical Analysis... 18.0 \n3
I have two groups of data. Each with a dif... 23.0 \n4
The R-project
\\n\\n\n\n
\n \n \n | \n postId | \n PostTypeId | \n AcceptedAnswerId | \n CreaionDate | \n Score | \n ViewCount | \n Body | \n userId | \n LasActivityDate | \n Title | \n ... | \n AnswerCount | \n CommentCount | \n FavoriteCount | \n LastEditorUserId | \n LastEditDate | \n CommunityOwnedDate | \n ParentId | \n ClosedDate | \n OwnerDisplayName | \n LastEditorDisplayName | \n
\n \n \n \n | 0 | \n 1 | \n 1 | \n 15.0 | \n 2010-07-19 19:12:12 | \n 23 | \n 1278.0 | \n <p>How should I elicit prior distributions fro... | \n 8.0 | \n 2010-09-15 21:08:26 | \n Eliciting priors from experts | \n ... | \n 5.0 | \n 1 | \n 14.0 | \n NaN | \n NaT | \n NaT | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 1 | \n 2 | \n 1 | \n 59.0 | \n 2010-07-19 19:12:57 | \n 22 | \n 8198.0 | \n <p>In many different statistical methods there... | \n 24.0 | \n 2012-11-12 09:21:54 | \n What is normality? | \n ... | \n 7.0 | \n 1 | \n 8.0 | \n 88.0 | \n 2010-08-07 17:56:44 | \n NaT | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 2 | \n 3 | \n 1 | \n 5.0 | \n 2010-07-19 19:13:28 | \n 54 | \n 3613.0 | \n <p>What are some valuable Statistical Analysis... | \n 18.0 | \n 2013-05-27 14:48:36 | \n What are some valuable Statistical Analysis op... | \n ... | \n 19.0 | \n 4 | \n 36.0 | \n 183.0 | \n 2011-02-12 05:50:03 | \n 2010-07-19 19:13:28 | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 3 | \n 4 | \n 1 | \n 135.0 | \n 2010-07-19 19:13:31 | \n 13 | \n 5224.0 | \n <p>I have two groups of data. Each with a dif... | \n 23.0 | \n 2010-09-08 03:00:19 | \n Assessing the significance of differences in d... | \n ... | \n 5.0 | \n 2 | \n 2.0 | \n NaN | \n NaT | \n NaT | \n NaN | \n NaT | \n None | \n None | \n
\n \n | 4 | \n 5 | \n 2 | \n NaN | \n 2010-07-19 19:14:43 | \n 81 | \n NaN | \n <p>The R-project</p>\\n\\n<p><a href=\"http://www... | \n 23.0 | \n 2010-07-19 19:21:15 | \n None | \n ... | \n NaN | \n 3 | \n NaN | \n 23.0 | \n 2010-07-19 19:21:15 | \n 2010-07-19 19:14:43 | \n 3.0 | \n NaT | \n None | \n None | \n
\n \n
\n5 rows × 21 columns
\n"
+ },
+ "metadata": {},
+ "execution_count": 25
+ }
+ ],
+ "source": [
+ "posts_table.head(n=5)"
+ ]
},
{
"cell_type": "markdown",
@@ -109,10 +202,51 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "new_users = users_table.loc[:, ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]\n",
+ "new_posts = posts_table.loc[:, ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " userId Reputation Views UpVotes DownVotes\n0 -1 1 0 5007 1920\n1 2 101 25 3 0\n2 3 101 22 19 0\n3 4 101 11 0 0\n4 5 6792 1145 662 5",
+ "text/html": "\n\n
\n \n \n | \n userId | \n Reputation | \n Views | \n UpVotes | \n DownVotes | \n
\n \n \n \n | 0 | \n -1 | \n 1 | \n 0 | \n 5007 | \n 1920 | \n
\n \n | 1 | \n 2 | \n 101 | \n 25 | \n 3 | \n 0 | \n
\n \n | 2 | \n 3 | \n 101 | \n 22 | \n 19 | \n 0 | \n
\n \n | 3 | \n 4 | \n 101 | \n 11 | \n 0 | \n 0 | \n
\n \n | 4 | \n 5 | \n 6792 | \n 1145 | \n 662 | \n 5 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 27
+ }
+ ],
+ "source": [
+ "new_users.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " postId Score userId ViewCount CommentCount\n0 1 23 8.0 1278.0 1\n1 2 22 24.0 8198.0 1\n2 3 54 18.0 3613.0 4\n3 4 13 23.0 5224.0 2\n4 5 81 23.0 NaN 3",
+ "text/html": "\n\n
\n \n \n | \n postId | \n Score | \n userId | \n ViewCount | \n CommentCount | \n
\n \n \n \n | 0 | \n 1 | \n 23 | \n 8.0 | \n 1278.0 | \n 1 | \n
\n \n | 1 | \n 2 | \n 22 | \n 24.0 | \n 8198.0 | \n 1 | \n
\n \n | 2 | \n 3 | \n 54 | \n 18.0 | \n 3613.0 | \n 4 | \n
\n \n | 3 | \n 4 | \n 13 | \n 23.0 | \n 5224.0 | \n 2 | \n
\n \n | 4 | \n 5 | \n 81 | \n 23.0 | \n NaN | \n 3 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 28
+ }
+ ],
+ "source": [
+ "new_posts.head(n=5)"
+ ]
},
{
"cell_type": "markdown",
@@ -124,10 +258,23 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 29,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n0 -1 1 0 5007 1920 2175 0 NaN \n1 -1 1 0 5007 1920 8576 0 NaN \n2 -1 1 0 5007 1920 8578 0 NaN \n3 -1 1 0 5007 1920 8981 0 NaN \n4 -1 1 0 5007 1920 8982 0 NaN \n\n CommentCount \n0 0 \n1 0 \n2 0 \n3 0 \n4 0 ",
+ "text/html": "\n\n
\n \n \n | \n userId | \n Reputation | \n Views | \n UpVotes | \n DownVotes | \n postId | \n Score | \n ViewCount | \n CommentCount | \n
\n \n \n \n | 0 | \n -1 | \n 1 | \n 0 | \n 5007 | \n 1920 | \n 2175 | \n 0 | \n NaN | \n 0 | \n
\n \n | 1 | \n -1 | \n 1 | \n 0 | \n 5007 | \n 1920 | \n 8576 | \n 0 | \n NaN | \n 0 | \n
\n \n | 2 | \n -1 | \n 1 | \n 0 | \n 5007 | \n 1920 | \n 8578 | \n 0 | \n NaN | \n 0 | \n
\n \n | 3 | \n -1 | \n 1 | \n 0 | \n 5007 | \n 1920 | \n 8981 | \n 0 | \n NaN | \n 0 | \n
\n \n | 4 | \n -1 | \n 1 | \n 0 | \n 5007 | \n 1920 | \n 8982 | \n 0 | \n NaN | \n 0 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 29
+ }
+ ],
+ "source": [
+ "users_and_posts = new_users.merge(new_posts, on='userId', how='inner')\n",
+ "users_and_posts.head(n=5)"
+ ]
},
{
"cell_type": "markdown",
@@ -138,10 +285,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 34,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "userId 0\nReputation 0\nViews 0\nUpVotes 0\nDownVotes 0\npostId 0\nScore 0\nViewCount 48396\nCommentCount 0\ndtype: int64\n\n\nWe have 48396 missing values in our merged Dataframe, all of which are located in the ViewCount column.\n"
+ }
+ ],
+ "source": [
+ "# Per-column count of missing values, followed by a one-line summary for ViewCount.\nprint(users_and_posts.isna().sum())\n",
+ "print('\\n')\n",
+ "print('We have {} missing values in our merged Dataframe, all of which are located in the ViewCount column.'.format(users_and_posts['ViewCount'].isna().sum()))"
+ ]
},
{
"cell_type": "markdown",
@@ -153,10 +310,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 41,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "{1.0, 2.0}"
+ },
+ "metadata": {},
+ "execution_count": 41
+ }
+ ],
+ "source": [
+ "# Inspect the five smallest ViewCount values to see where the range starts.\n",
+ "set(users_and_posts['ViewCount'].sort_values().iloc[:5])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "{0.0,\n 1.0,\n 2.0,\n 3.0,\n 4.0,\n 5.0,\n 6.0,\n 7.0,\n 8.0,\n 9.0,\n 10.0,\n 11.0,\n 12.0,\n 13.0,\n 14.0,\n 15.0,\n 16.0,\n 17.0,\n 18.0,\n 19.0,\n 20.0,\n 21.0,\n 22.0,\n 23.0,\n 24.0,\n 25.0,\n 26.0,\n 27.0,\n 28.0,\n 29.0,\n 30.0,\n 31.0,\n 32.0,\n 33.0,\n 34.0,\n 35.0,\n 36.0,\n 37.0,\n 38.0,\n 39.0,\n 40.0,\n 41.0,\n 42.0,\n 43.0,\n 44.0,\n 45.0,\n 46.0,\n 47.0,\n 48.0,\n 49.0,\n 50.0,\n 51.0,\n 52.0,\n 53.0,\n 54.0,\n 55.0,\n 56.0,\n 57.0,\n 58.0,\n 59.0,\n 60.0,\n 61.0,\n 62.0,\n 63.0,\n 64.0,\n 65.0,\n 66.0,\n 67.0,\n 68.0,\n 69.0,\n 70.0,\n 71.0,\n 72.0,\n 73.0,\n 74.0,\n 75.0,\n 76.0,\n 77.0,\n 78.0,\n 79.0,\n 80.0,\n 81.0,\n 82.0,\n 83.0,\n 84.0,\n 85.0,\n 86.0,\n 87.0,\n 88.0,\n 89.0,\n 90.0,\n 91.0,\n 92.0,\n 93.0,\n 94.0,\n 95.0,\n 96.0,\n 97.0,\n 98.0,\n 99.0,\n 100.0,\n 101.0,\n 102.0,\n 103.0,\n 104.0,\n 105.0,\n 106.0,\n 107.0,\n 108.0,\n 109.0,\n 110.0,\n 111.0,\n 112.0,\n 113.0,\n 114.0,\n 115.0,\n 116.0,\n 117.0,\n 118.0,\n 119.0,\n 120.0,\n 121.0,\n 122.0,\n 123.0,\n 124.0,\n 125.0,\n 126.0,\n 127.0,\n 128.0,\n 129.0,\n 130.0,\n 131.0,\n 132.0,\n 133.0,\n 134.0,\n 135.0,\n 136.0,\n 137.0,\n 138.0,\n 139.0,\n 140.0,\n 141.0,\n 142.0,\n 143.0,\n 144.0,\n 145.0,\n 146.0,\n 147.0,\n 148.0,\n 149.0,\n 150.0,\n 151.0,\n 152.0,\n 153.0,\n 154.0,\n 155.0,\n 156.0,\n 157.0,\n 158.0,\n 159.0,\n 160.0,\n 161.0,\n 162.0,\n 163.0,\n 164.0,\n 165.0,\n 166.0,\n 167.0,\n 168.0,\n 169.0,\n 170.0,\n 171.0,\n 172.0,\n 173.0,\n 174.0,\n 175.0,\n 176.0,\n 177.0,\n 178.0,\n 179.0,\n 180.0,\n 181.0,\n 182.0,\n 183.0,\n 184.0,\n 185.0,\n 186.0,\n 187.0,\n 188.0,\n 189.0,\n 190.0,\n 191.0,\n 192.0,\n 193.0,\n 194.0,\n 195.0,\n 196.0,\n 197.0,\n 198.0,\n 199.0,\n 200.0,\n 201.0,\n 202.0,\n 203.0,\n 204.0,\n 205.0,\n 206.0,\n 207.0,\n 208.0,\n 209.0,\n 210.0,\n 211.0,\n 212.0,\n 213.0,\n 214.0,\n 215.0,\n 216.0,\n 217.0,\n 218.0,\n 219.0,\n 220.0,\n 221.0,\n 222.0,\n 223.0,\n 224.0,\n 225.0,\n 226.0,\n 227.0,\n 228.0,\n 229.0,\n 230.0,\n 231.0,\n 
232.0,\n 233.0,\n 234.0,\n 235.0,\n 236.0,\n 237.0,\n 238.0,\n 239.0,\n 240.0,\n 241.0,\n 242.0,\n 243.0,\n 244.0,\n 245.0,\n 246.0,\n 247.0,\n 248.0,\n 249.0,\n 250.0,\n 251.0,\n 252.0,\n 253.0,\n 254.0,\n 255.0,\n 256.0,\n 257.0,\n 258.0,\n 259.0,\n 260.0,\n 261.0,\n 262.0,\n 263.0,\n 264.0,\n 265.0,\n 266.0,\n 267.0,\n 268.0,\n 269.0,\n 270.0,\n 271.0,\n 272.0,\n 273.0,\n 274.0,\n 275.0,\n 276.0,\n 277.0,\n 278.0,\n 279.0,\n 280.0,\n 281.0,\n 282.0,\n 283.0,\n 284.0,\n 285.0,\n 286.0,\n 287.0,\n 288.0,\n 289.0,\n 290.0,\n 291.0,\n 292.0,\n 293.0,\n 294.0,\n 295.0,\n 296.0,\n 297.0,\n 298.0,\n 299.0,\n 300.0,\n 301.0,\n 302.0,\n 303.0,\n 304.0,\n 305.0,\n 306.0,\n 307.0,\n 308.0,\n 309.0,\n 310.0,\n 311.0,\n 312.0,\n 313.0,\n 314.0,\n 315.0,\n 316.0,\n 317.0,\n 318.0,\n 319.0,\n 320.0,\n 321.0,\n 322.0,\n 323.0,\n 324.0,\n 325.0,\n 326.0,\n 327.0,\n 328.0,\n 329.0,\n 330.0,\n 331.0,\n 332.0,\n 333.0,\n 334.0,\n 335.0,\n 336.0,\n 337.0,\n 338.0,\n 339.0,\n 340.0,\n 341.0,\n 342.0,\n 343.0,\n 344.0,\n 345.0,\n 346.0,\n 347.0,\n 348.0,\n 349.0,\n 350.0,\n 351.0,\n 352.0,\n 353.0,\n 354.0,\n 355.0,\n 356.0,\n 357.0,\n 358.0,\n 359.0,\n 360.0,\n 361.0,\n 362.0,\n 363.0,\n 364.0,\n 365.0,\n 366.0,\n 367.0,\n 368.0,\n 369.0,\n 370.0,\n 371.0,\n 372.0,\n 373.0,\n 374.0,\n 375.0,\n 376.0,\n 377.0,\n 378.0,\n 379.0,\n 380.0,\n 381.0,\n 382.0,\n 383.0,\n 384.0,\n 385.0,\n 386.0,\n 387.0,\n 388.0,\n 389.0,\n 390.0,\n 391.0,\n 392.0,\n 393.0,\n 394.0,\n 395.0,\n 396.0,\n 397.0,\n 398.0,\n 399.0,\n 400.0,\n 401.0,\n 402.0,\n 403.0,\n 404.0,\n 405.0,\n 406.0,\n 407.0,\n 408.0,\n 409.0,\n 410.0,\n 411.0,\n 412.0,\n 413.0,\n 414.0,\n 415.0,\n 416.0,\n 417.0,\n 418.0,\n 419.0,\n 420.0,\n 421.0,\n 422.0,\n 423.0,\n 424.0,\n 425.0,\n 426.0,\n 427.0,\n 428.0,\n 429.0,\n 430.0,\n 431.0,\n 432.0,\n 433.0,\n 434.0,\n 435.0,\n 436.0,\n 437.0,\n 438.0,\n 439.0,\n 440.0,\n 441.0,\n 442.0,\n 443.0,\n 444.0,\n 445.0,\n 446.0,\n 447.0,\n 448.0,\n 449.0,\n 450.0,\n 451.0,\n 452.0,\n 453.0,\n 
454.0,\n 455.0,\n 456.0,\n 457.0,\n 458.0,\n 459.0,\n 460.0,\n 461.0,\n 462.0,\n 463.0,\n 464.0,\n 465.0,\n 466.0,\n 467.0,\n 468.0,\n 469.0,\n 470.0,\n 471.0,\n 472.0,\n 473.0,\n 474.0,\n 475.0,\n 476.0,\n 477.0,\n 478.0,\n 479.0,\n 480.0,\n 481.0,\n 482.0,\n 483.0,\n 484.0,\n 485.0,\n 486.0,\n 487.0,\n 488.0,\n 489.0,\n 490.0,\n 491.0,\n 492.0,\n 493.0,\n 494.0,\n 495.0,\n 496.0,\n 497.0,\n 498.0,\n 499.0,\n 500.0,\n 501.0,\n 502.0,\n 503.0,\n 504.0,\n 505.0,\n 506.0,\n 507.0,\n 508.0,\n 509.0,\n 510.0,\n 511.0,\n 512.0,\n 513.0,\n 514.0,\n 515.0,\n 516.0,\n 517.0,\n 518.0,\n 519.0,\n 520.0,\n 521.0,\n 522.0,\n 523.0,\n 524.0,\n 525.0,\n 526.0,\n 527.0,\n 528.0,\n 529.0,\n 530.0,\n 531.0,\n 532.0,\n 533.0,\n 534.0,\n 535.0,\n 536.0,\n 537.0,\n 538.0,\n 539.0,\n 540.0,\n 541.0,\n 542.0,\n 543.0,\n 544.0,\n 545.0,\n 546.0,\n 547.0,\n 548.0,\n 549.0,\n 550.0,\n 551.0,\n 552.0,\n 553.0,\n 554.0,\n 555.0,\n 556.0,\n 557.0,\n 558.0,\n 559.0,\n 560.0,\n 561.0,\n 562.0,\n 563.0,\n 564.0,\n 565.0,\n 566.0,\n 567.0,\n 568.0,\n 569.0,\n 570.0,\n 571.0,\n 572.0,\n 573.0,\n 574.0,\n 575.0,\n 576.0,\n 577.0,\n 578.0,\n 579.0,\n 580.0,\n 581.0,\n 582.0,\n 583.0,\n 584.0,\n 585.0,\n 586.0,\n 587.0,\n 588.0,\n 589.0,\n 590.0,\n 591.0,\n 592.0,\n 593.0,\n 594.0,\n 595.0,\n 596.0,\n 597.0,\n 598.0,\n 599.0,\n 600.0,\n 601.0,\n 602.0,\n 603.0,\n 604.0,\n 605.0,\n 606.0,\n 607.0,\n 608.0,\n 609.0,\n 610.0,\n 611.0,\n 612.0,\n 613.0,\n 614.0,\n 615.0,\n 616.0,\n 617.0,\n 618.0,\n 619.0,\n 620.0,\n 621.0,\n 622.0,\n 623.0,\n 624.0,\n 625.0,\n 626.0,\n 627.0,\n 628.0,\n 629.0,\n 630.0,\n 631.0,\n 632.0,\n 633.0,\n 634.0,\n 635.0,\n 636.0,\n 637.0,\n 638.0,\n 639.0,\n 640.0,\n 641.0,\n 642.0,\n 643.0,\n 644.0,\n 645.0,\n 646.0,\n 647.0,\n 648.0,\n 649.0,\n 650.0,\n 651.0,\n 652.0,\n 653.0,\n 654.0,\n 655.0,\n 656.0,\n 657.0,\n 658.0,\n 659.0,\n 660.0,\n 661.0,\n 662.0,\n 663.0,\n 664.0,\n 665.0,\n 666.0,\n 667.0,\n 668.0,\n 669.0,\n 670.0,\n 671.0,\n 672.0,\n 673.0,\n 674.0,\n 675.0,\n 
676.0,\n 677.0,\n 678.0,\n 679.0,\n 680.0,\n 681.0,\n 682.0,\n 683.0,\n 684.0,\n 685.0,\n 686.0,\n 687.0,\n 688.0,\n 689.0,\n 690.0,\n 691.0,\n 692.0,\n 693.0,\n 694.0,\n 695.0,\n 696.0,\n 697.0,\n 698.0,\n 699.0,\n 700.0,\n 701.0,\n 702.0,\n 703.0,\n 704.0,\n 705.0,\n 706.0,\n 707.0,\n 708.0,\n 709.0,\n 710.0,\n 711.0,\n 712.0,\n 713.0,\n 714.0,\n 715.0,\n 716.0,\n 717.0,\n 718.0,\n 719.0,\n 720.0,\n 721.0,\n 722.0,\n 723.0,\n 724.0,\n 725.0,\n 726.0,\n 727.0,\n 728.0,\n 729.0,\n 730.0,\n 731.0,\n 732.0,\n 733.0,\n 734.0,\n 735.0,\n 736.0,\n 737.0,\n 738.0,\n 739.0,\n 740.0,\n 741.0,\n 742.0,\n 743.0,\n 744.0,\n 745.0,\n 746.0,\n 747.0,\n 748.0,\n 749.0,\n 750.0,\n 751.0,\n 752.0,\n 753.0,\n 754.0,\n 755.0,\n 756.0,\n 757.0,\n 758.0,\n 759.0,\n 760.0,\n 761.0,\n 762.0,\n 763.0,\n 764.0,\n 765.0,\n 766.0,\n 767.0,\n 768.0,\n 769.0,\n 770.0,\n 771.0,\n 772.0,\n 773.0,\n 774.0,\n 775.0,\n 776.0,\n 777.0,\n 778.0,\n 779.0,\n 780.0,\n 781.0,\n 782.0,\n 783.0,\n 784.0,\n 785.0,\n 786.0,\n 787.0,\n 788.0,\n 789.0,\n 790.0,\n 791.0,\n 792.0,\n 793.0,\n 794.0,\n 795.0,\n 796.0,\n 797.0,\n 798.0,\n 799.0,\n 800.0,\n 801.0,\n 802.0,\n 803.0,\n 804.0,\n 805.0,\n 806.0,\n 807.0,\n 808.0,\n 809.0,\n 810.0,\n 811.0,\n 812.0,\n 813.0,\n 814.0,\n 815.0,\n 816.0,\n 817.0,\n 818.0,\n 819.0,\n 820.0,\n 821.0,\n 822.0,\n 823.0,\n 824.0,\n 825.0,\n 826.0,\n 827.0,\n 828.0,\n 829.0,\n 830.0,\n 831.0,\n 832.0,\n 833.0,\n 834.0,\n 835.0,\n 836.0,\n 837.0,\n 838.0,\n 839.0,\n 840.0,\n 841.0,\n 842.0,\n 843.0,\n 844.0,\n 845.0,\n 846.0,\n 847.0,\n 848.0,\n 849.0,\n 850.0,\n 851.0,\n 852.0,\n 853.0,\n 854.0,\n 855.0,\n 856.0,\n 857.0,\n 858.0,\n 859.0,\n 860.0,\n 861.0,\n 862.0,\n 863.0,\n 864.0,\n 865.0,\n 866.0,\n 867.0,\n 868.0,\n 869.0,\n 870.0,\n 871.0,\n 872.0,\n 873.0,\n 874.0,\n 875.0,\n 876.0,\n 877.0,\n 878.0,\n 879.0,\n 880.0,\n 881.0,\n 882.0,\n 883.0,\n 884.0,\n 885.0,\n 886.0,\n 887.0,\n 888.0,\n 889.0,\n 890.0,\n 891.0,\n 892.0,\n 893.0,\n 894.0,\n 895.0,\n 896.0,\n 897.0,\n 
898.0,\n 899.0,\n 900.0,\n 901.0,\n 902.0,\n 903.0,\n 904.0,\n 905.0,\n 906.0,\n 907.0,\n 908.0,\n 909.0,\n 910.0,\n 911.0,\n 912.0,\n 913.0,\n 914.0,\n 915.0,\n 916.0,\n 917.0,\n 918.0,\n 919.0,\n 920.0,\n 921.0,\n 922.0,\n 923.0,\n 924.0,\n 925.0,\n 926.0,\n 927.0,\n 928.0,\n 929.0,\n 930.0,\n 931.0,\n 932.0,\n 933.0,\n 934.0,\n 935.0,\n 936.0,\n 937.0,\n 938.0,\n 939.0,\n 940.0,\n 941.0,\n 942.0,\n 943.0,\n 944.0,\n 945.0,\n 946.0,\n 947.0,\n 948.0,\n 949.0,\n 950.0,\n 951.0,\n 952.0,\n 953.0,\n 954.0,\n 955.0,\n 956.0,\n 957.0,\n 958.0,\n 959.0,\n 960.0,\n 961.0,\n 962.0,\n 963.0,\n 964.0,\n 965.0,\n 966.0,\n 967.0,\n 968.0,\n 969.0,\n 970.0,\n 971.0,\n 972.0,\n 973.0,\n 974.0,\n 975.0,\n 976.0,\n 977.0,\n 978.0,\n 979.0,\n 980.0,\n 981.0,\n 982.0,\n 983.0,\n 984.0,\n 985.0,\n 986.0,\n 987.0,\n 988.0,\n 989.0,\n 990.0,\n 991.0,\n 992.0,\n 993.0,\n 994.0,\n 995.0,\n 996.0,\n 997.0,\n 998.0,\n 999.0,\n ...}"
+ },
+ "metadata": {},
+ "execution_count": 47
+ }
+ ],
+ "source": [
+ "# The smallest ViewCount values seen in the previous cell are 1 and 2, so 0 is used as the placeholder for missing entries.\n# Assign the result back: fillna(inplace=True) on a selected column is chained assignment and may leave the frame unchanged.\n",
+ "users_and_posts['ViewCount'] = users_and_posts['ViewCount'].fillna(0)\n",
+ "set(users_and_posts.ViewCount.sort_values())"
+ ]
},
{
"cell_type": "markdown",
@@ -167,10 +356,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 48,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "import numpy as np\n",
+ "users_and_posts['ViewCount'] = users_and_posts['ViewCount'].astype(np.int64)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "\nInt64Index: 90584 entries, 0 to 90583\nData columns (total 9 columns):\n # Column Non-Null Count Dtype\n--- ------ -------------- -----\n 0 userId 90584 non-null int64\n 1 Reputation 90584 non-null int64\n 2 Views 90584 non-null int64\n 3 UpVotes 90584 non-null int64\n 4 DownVotes 90584 non-null int64\n 5 postId 90584 non-null int64\n 6 Score 90584 non-null int64\n 7 ViewCount 90584 non-null int64\n 8 CommentCount 90584 non-null int64\ndtypes: int64(9)\nmemory usage: 9.4 MB\n"
+ }
+ ],
+ "source": [
+ "# Check the dtype and non-null count of every column after the cast.\nusers_and_posts.info()"
+ ]
},
{
"cell_type": "markdown",
@@ -182,9 +389,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3.7.6 64-bit ('anaconda3': virtualenv)",
"language": "python",
- "name": "python3"
+ "name": "python37664bitanaconda3virtualenv0697af1ee67a458e9253591065064715"
},
"language_info": {
"codemirror_mode": {
@@ -196,9 +403,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.5"
+ "version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
-}
+}
\ No newline at end of file