ta-data-bcn · anyswds · Nov 1, 2020 · Nov 1, 2020
diff --git a/your-code/main.ipynb b/your-code/main.ipynb
@@ -11,10 +11,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import pandas as pd"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -25,10 +27,101 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>userId</th>\n",
+       "      <th>Reputation</th>\n",
+       "      <th>Views</th>\n",
+       "      <th>UpVotes</th>\n",
+       "      <th>DownVotes</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5007</td>\n",
+       "      <td>1920</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>101</td>\n",
+       "      <td>25</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>101</td>\n",
+       "      <td>22</td>\n",
+       "      <td>19</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>101</td>\n",
+       "      <td>11</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>6792</td>\n",
+       "      <td>1145</td>\n",
+       "      <td>662</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   userId  Reputation  Views  UpVotes  DownVotes\n",
+       "0      -1           1      0     5007       1920\n",
+       "1       2         101     25        3          0\n",
+       "2       3         101     22       19          0\n",
+       "3       4         101     11        0          0\n",
+       "4       5        6792   1145      662          5"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "users = pd.read_csv(\"../data/users.csv\", index_col = 0)\n",
+    "users.head()"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -42,7 +135,9 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "#users.rename(columns={x : 'UserId'})"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -53,10 +148,100 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>PostId</th>\n",
+       "      <th>userId</th>\n",
+       "      <th>Score</th>\n",
+       "      <th>ViewCount</th>\n",
+       "      <th>CommentCount</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>8.0</td>\n",
+       "      <td>23</td>\n",
+       "      <td>1278.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>24.0</td>\n",
+       "      <td>22</td>\n",
+       "      <td>8198.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>18.0</td>\n",
+       "      <td>54</td>\n",
+       "      <td>3613.0</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>23.0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>5224.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>23.0</td>\n",
+       "      <td>81</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   PostId  userId  Score  ViewCount  CommentCount\n",
+       "0       1     8.0     23     1278.0             1\n",
+       "1       2    24.0     22     8198.0             1\n",
+       "2       3    18.0     54     3613.0             4\n",
+       "3       4    23.0     13     5224.0             2\n",
+       "4       5    23.0     81        NaN             3"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "posts = pd.read_csv(\"../data/posts.csv\", index_col = 0)\n",
+    "posts.head()"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -70,7 +255,9 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "#posts.rename(columns={x,y : 'userId'})"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -83,10 +270,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "new_users = users[[\"userId\", \"Reputation\", \"Views\", \"UpVotes\", \"DownVotes\"]]\n",
+    "new_posts = posts[[\"PostId\", \"Score\", \"userId\", \"ViewCount\", \"CommentCount\"]]"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -98,10 +288,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "new_users_post_df = new_users.merge(right=new_posts, how=\"inner\", on=\"userId\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -112,10 +304,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "userId              0\n",
+       "Reputation          0\n",
+       "Views               0\n",
+       "UpVotes             0\n",
+       "DownVotes           0\n",
+       "PostId              0\n",
+       "Score               0\n",
+       "ViewCount       48396\n",
+       "CommentCount        0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "new_users_post_df.isnull().sum()\n",
+    "# ViewCount has many missing values (48,396 rows); every other column is complete"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -127,10 +342,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Impute missing ViewCount values with the column mean, a simple baseline (k-NN imputation would be a more refined option); fillna is assigned back because replace((nan, mean)) never fills with the mean.\n",
+    "import numpy as np\n",
+    "mean_ViewCount = new_users_post_df[\"ViewCount\"].mean()\n",
+    "new_users_post_df[\"ViewCount\"] = new_users_post_df[\"ViewCount\"].fillna(mean_ViewCount)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -141,10 +361,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 39,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "userId            int64\n",
+       "Reputation        int64\n",
+       "Views             int64\n",
+       "UpVotes           int64\n",
+       "DownVotes         int64\n",
+       "PostId            int64\n",
+       "Score             int64\n",
+       "ViewCount       float64\n",
+       "CommentCount      int64\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "new_users_post_df.dtypes\n",
+    "# One solution would be to make the dtypes uniform by casting ViewCount to int64 (only possible once it contains no NaN): new_users_post_df[\"ViewCount\"] = new_users_post_df[\"ViewCount\"].astype(int)"
+   ]
   }
  ],
  "metadata": {
@@ -163,7 +406,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.2"
+   "version": "3.7.1"
   }
  },
  "nbformat": 4,