Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 266 additions & 23 deletions your-code/main.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
Expand All @@ -25,10 +27,101 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>Reputation</th>\n",
" <th>Views</th>\n",
" <th>UpVotes</th>\n",
" <th>DownVotes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5007</td>\n",
" <td>1920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>101</td>\n",
" <td>25</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>101</td>\n",
" <td>22</td>\n",
" <td>19</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>101</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>6792</td>\n",
" <td>1145</td>\n",
" <td>662</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId Reputation Views UpVotes DownVotes\n",
"0 -1 1 0 5007 1920\n",
"1 2 101 25 3 0\n",
"2 3 101 22 19 0\n",
"3 4 101 11 0 0\n",
"4 5 6792 1145 662 5"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"users = pd.read_csv(\"../data/users.csv\", index_col = 0)\n",
"users.head()"
]
},
{
"cell_type": "markdown",
Expand All @@ -42,7 +135,9 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"#users.rename(columns={x : 'UserId'})"
]
},
{
"cell_type": "markdown",
Expand All @@ -53,10 +148,100 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostId</th>\n",
" <th>userId</th>\n",
" <th>Score</th>\n",
" <th>ViewCount</th>\n",
" <th>CommentCount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>8.0</td>\n",
" <td>23</td>\n",
" <td>1278.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>24.0</td>\n",
" <td>22</td>\n",
" <td>8198.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>18.0</td>\n",
" <td>54</td>\n",
" <td>3613.0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>23.0</td>\n",
" <td>13</td>\n",
" <td>5224.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>23.0</td>\n",
" <td>81</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostId userId Score ViewCount CommentCount\n",
"0 1 8.0 23 1278.0 1\n",
"1 2 24.0 22 8198.0 1\n",
"2 3 18.0 54 3613.0 4\n",
"3 4 23.0 13 5224.0 2\n",
"4 5 23.0 81 NaN 3"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"posts = pd.read_csv(\"../data/posts.csv\", index_col = 0)\n",
"posts.head()"
]
},
{
"cell_type": "markdown",
Expand All @@ -70,7 +255,9 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"#posts.rename(columns={x,y : 'userId'})"
]
},
{
"cell_type": "markdown",
Expand All @@ -83,10 +270,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": []
"source": [
"new_users = users[[\"userId\", \"Reputation\", \"Views\", \"UpVotes\", \"DownVotes\"]]\n",
"new_posts = posts[[\"PostId\", \"Score\", \"userId\", \"ViewCount\", \"CommentCount\"]]"
]
},
{
"cell_type": "markdown",
Expand All @@ -98,10 +288,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": []
"source": [
"new_users_post_df = new_users.merge(right=new_posts, how=\"inner\", on=\"userId\")"
]
},
{
"cell_type": "markdown",
Expand All @@ -112,10 +304,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": [
"userId 0\n",
"Reputation 0\n",
"Views 0\n",
"UpVotes 0\n",
"DownVotes 0\n",
"PostId 0\n",
"Score 0\n",
"ViewCount 48396\n",
"CommentCount 0\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_users_post_df.isnull().sum()\n",
"#in ViewCount there're a lot of missing values"
]
},
{
"cell_type": "markdown",
Expand All @@ -127,10 +342,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# One way will be imputation Using k-NN. However, I don't know how implant it, so I will fill them with the mean of the column (dummy option)\n",
"import numpy as np\n",
"mean_ViewCount = new_users_post_df[\"ViewCount\"].mean()\n",
"new_users_post_df[\"ViewCount\"].replace((np.nan, mean_ViewCount), inplace =True)"
]
},
{
"cell_type": "markdown",
Expand All @@ -141,10 +361,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": [
"userId int64\n",
"Reputation int64\n",
"Views int64\n",
"UpVotes int64\n",
"DownVotes int64\n",
"PostId int64\n",
"Score int64\n",
"ViewCount float64\n",
"CommentCount int64\n",
"dtype: object"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_users_post_df.dtypes\n",
"# one solution would be to be uniforme type object as int64: new_users_post_df[\"ViewCount\"]= new_users_post_df[\"ViewCount\"].astype(int)"
]
}
],
"metadata": {
Expand All @@ -163,7 +406,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
"version": "3.7.1"
}
},
"nbformat": 4,
Expand Down