PhysicsOfData · ElifSeven · Jan 9, 2020 · Jan 12, 2020 · Jan 12, 2020 · Apr 14, 2020
diff --git a/GetDistance.ipynb b/GetDistance.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Select the rows of the distance matrix that link a fixed list of origin\n",
+    "# codes to one destination code, and export them as a compact CSV.\n",
+    "\n",
+    "# Forward slashes keep the relative paths valid on Windows, macOS and Linux.\n",
+    "file_name_6 = \"data/veneto.txt\"\n",
+    "output_name_6 = \"data/distances_for_padova.csv\"\n",
+    "\n",
+    "# Destination code shared by every '<origin> - <destination>' label in `Name`.\n",
+    "# NOTE(review): confirm 28001 is the intended reference comune for the Padova distances.\n",
+    "destination_code = 28001\n",
+    "\n",
+    "# Origin codes to keep (presumably ISTAT comune codes - verify against the lookup table).\n",
+    "origin_codes = [\n",
+    "    1272, 2158, 3106, 4078, 5005, 6003, 96004, 103072,\n",
+    "    7003, 12133, 13075, 14061, 15146, 16024, 17029, 18110,\n",
+    "    19036, 20030, 97042, 98031, 108033, 21008, 22205, 23091,\n",
+    "    24116, 25006, 26086, 27042, 28060, 29041, 30129, 31007,\n",
+    "    32006, 93033, 8031, 9056, 10025, 11015, 33032, 34027,\n",
+    "    35033, 36023, 37006, 38008, 39014, 40012, 99014, 45010,\n",
+    "    46017, 47014, 48017, 49009, 50026, 51002, 52032, 53011,\n",
+    "    100005, 55032, 41044, 42002, 43023, 44007, 109006, 54039,\n",
+    "    56059, 57059, 58091, 59011, 60038, 66049, 67041, 68028,\n",
+    "    69022, 70006, 94023, 61022, 62008, 63049, 64008, 65116,\n",
+    "    71024, 72006, 73027, 74001, 75035, 110001, 110002, 110009,\n",
+    "    76063, 77014, 78045, 79023, 80063, 101010, 102047, 81021,\n",
+    "    82053, 83048, 84001, 85004, 86009, 87015, 88009, 89017,\n",
+    "    90064, 91051, 92009, 95038, 111009,\n",
+    "]\n",
+    "selected_names = [f\"{code} - {destination_code}\" for code in origin_codes]\n",
+    "\n",
+    "read_options = dict(delimiter=';', header=0, quoting=0, encoding=\"Latin-1\")\n",
+    "try:\n",
+    "    # pandas >= 1.3: malformed rows are skipped with a warning.\n",
+    "    df_6 = pd.read_csv(file_name_6, on_bad_lines=\"warn\", **read_options)\n",
+    "except TypeError:\n",
+    "    # Older pandas releases only accept the error_bad_lines flag.\n",
+    "    df_6 = pd.read_csv(file_name_6, error_bad_lines=False, **read_options)\n",
+    "\n",
+    "# Keep only the first six characters of the textual value\n",
+    "# (presumably to trim the decimals of the distance - confirm).\n",
+    "df_6['Total_Mete'] = df_6['Total_Mete'].astype(str).str[:6]\n",
+    "\n",
+    "# One vectorised membership test replaces a long chain of equality checks.\n",
+    "df_6_co = df_6[df_6['Name'].isin(selected_names)]\n",
+    "df_6_co = df_6_co.drop(['Origine', 'Destinazione', 'Total_Minu'], axis=1)\n",
+    "df_6_co.to_csv(output_name_6, index=False, header=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Project.ipynb b/Project.ipynb
@@ -0,0 +1,91 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Vodafone users' fluxes\n",
+    "\n",
+    "The study of the flux of people inside urban areas is of paramount importance to achieve an optimal understanding of emerging critical issues in the local mobility, and to explore areas of potential improvements in the infrastructures and local transports.<br>\n",
+    "The mobility of users within and toward Padova has been monitored using the data provided by the Vodafone mobile carrier, which provides the information based on the users' connections to the network cells.<br>\n",
+    "The data provided by the carrier encompasses the monitoring of the users connected to the Vodafone network in Padova in a four-month period from February to May of 2018.<br>\n",
+    "To provide statistical insights on the number and the flow of users, the data is aggregated based on the origin and movements of the users by averaging the number of connections during the time of the monitoring.<br>\n",
+    "To further avoid privacy violation issues, all observations with fewer than 30 units (e.g. day-areas to which <30 users have contributed) have been discarded and/or merged into dedicated categories (indicated with \"altro\", or \"other\").<br>\n",
+    "\n",
+    "## Datasets \n",
+    "\n",
+    "The data is provided in .csv files.\n",
+    "\n",
+    "* __day_od.csv__: table of users' origins and destinations averaged by the day of the week. The data is provided with details of the month, type of user (resident in Padova/Italian visitor/foreign visitor), country of provenance, together with the province and comune of the user (if available).\n",
+    "* __distinct_users_day.csv__: table of the number of distinct users by origin. The data is provided with details of the month, type of user (resident in Padova/Italian visitor/foreign visitor), country of provenance, together with the province and comune of the user (if available).\n",
+    "\n",
+    "The information is stored in the fields according to the following scheme: \n",
+    "\n",
+    "- MONTH: month analyzed\n",
+    "- DOW: day of the week analyzed\n",
+    "- ORIGIN: user's origin area\n",
+    "- DESTINATION: user's destination area\n",
+    "- CUST_CLASS: user type (resident / Italian visitor / foreign visitor)\n",
+    "- COD_COUNTRY: user's country code (e.g. 222=Italy)\n",
+    "- COD_PRO: user's province code (e.g. 12=Varese) \n",
+    "- PRO_COM: user's comune code (e.g. 12026=Busto Arsizio)\n",
+    "- FLOW: number of movements for given date-time (with a minimum of 30 users)\n",
+    "- VISITORS: overall number of users \n",
+    "\n",
+    "Together with the data files, three lookup tables are provided to allow matching the ISTAT country, province and comune codes to the actual names.\n",
+    "\n",
+    "* __codici_istat_comune.csv__: lookup file containing the _comune_ ISTAT code-names mapping\n",
+    "* __codici_istat_provincia.csv__: lookup file containing the _province_ ISTAT code-names mapping\n",
+    "* __codici_nazioni.csv__: lookup file mapping the _country_ code to its name\n",
+    "\n",
+    "Additional information useful for the study of the flow of users, such as the number of inhabitants of each province and the distance between Padova and all other Italian provinces, can be extracted from the data collected by ISTAT (e.g. http://dati.istat.it/Index.aspx?lang=en&SubSessionId=bcd553a8-280a-4b08-afee-cf7a72fd2ad9, http://www.istat.it/storage/cartografia/matrici_distanze/Veneto.zip)\n",
+    "\n",
+    "## Assignment\n",
+    "\n",
+    "1. Data preparation: the csv files are originated from different sources, hence resulting in differences in the encoding and end-of-lines that have to be taken into account in the data preparation phase. Make sure each .csv file is properly interpreted.\n",
+    "* Ranking of visitors from foreign countries: based on the number of total visitors per each country, create a ranked plot of the first 20 countries with the most visitors\n",
+    "* Ranking of Italian visitors by province, weighted by the number of inhabitants: based on the number of total visitors per Italian province, create a ranked plot of the first 20 provinces with the most visitors taking into account the number of inhabitants.\n",
+    "* Study of the visitors' fluxes: suppose you are asked to provide indications of how to invest resources to improve the mobility toward Padova. Given the three main directions from which visitors and commuters reach Padova via the Italian highways (south: A13 toward Bologna-Roma / west: A4 toward Milano-Torino / north-east: A4 toward Venice-Trieste), evaluate which of the three directions has to be prioritized:\n",
+    " 1. Consider a simplified case involving only the mid-range mobility, based on the number of visitors/commuters from the nearby regions only\n",
+    " * Consider the provinces located on the three directions that are mostly contributing to the flow of weekend visitors and working daily commuters by performing a more detailed study of the fluxes based on the day of the week\n",
+    " * _[OPEN]_ Use the data available to provide what you believe is the best possible answer \n",
+    "* Plot the distribution of the number of visitors by the distance of the province of origin. Determine which kind of law should be used to describe the distribution.\n",
+    " 1. Assuming an analytic form can be used to describe the trend, create a regression to estimate the expected number of visitors by the distance of the province of origin. Illustrate the difference between the resulting regression and the numbers provided by the Vodafone monitoring, and highlight the five most striking discrepancies from the expectations.\n",
+    "* _[OPEN]_ Use all the data available (and additional data from the ISTAT sources if needed) to extract another interesting piece of information concerning the mobility and the flux of visitors and commuters\n",
+    "\n",
+    "### Contacts\n",
+    "\n",
+    "* Marco Zanetti <marco.zanetti@unipd.it>\n",
+    "* Jacopo Pazzini <jacopo.pazzini@unipd.it>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Project_Group_6.ipynb b/Project_Group_6.ipynb
diff --git a/Report_G6.pdf b/Report_G6.pdf