-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path4.2.Data_Analysis_with_Python__Pandas.py
More file actions
255 lines (191 loc) · 9.58 KB
/
4.2.Data_Analysis_with_Python__Pandas.py
File metadata and controls
255 lines (191 loc) · 9.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# Pandas is a Python library for working with data sets: it provides functions
# for analyzing, cleaning, exploring, and manipulating data. The name refers to
# both "Panel Data" and "Python Data Analysis".
# A pandas Series is a one-dimensional labeled array holding data of any type,
# like a single column of a table.
import pandas as pd

values = [25, 10, 12, 4, 5]
print(pd.Series(values))
s = pd.Series(values)
print(type(s))   # pandas.core.series.Series
print(s.index)   # default integer index: RangeIndex(start=0, stop=5)
print("PSeries dtype: ", s.dtype, "Size: ", s.size, "Dimension:", s.ndim)
print(s.values)  # the underlying NumPy array
print(s.head(3))  # first three elements
print(s.tail(3))  # last three elements
# Pandas DataFrames: data sets in pandas are usually multidimensional tables,
# called DataFrames. A Series is like one column; a DataFrame is the whole table.
# Reading data from disk:
import pandas as pd

# read_csv is only one of many readers; pandas also has read_excel, read_json,
# read_sql, ... (search the pd namespace for names starting with "read").
df = pd.read_csv("Dataset/advertising.csv")
print(df.head())
# Pandas CheatSheet ***
# Quick Look at Data
import seaborn as sns

df = sns.load_dataset("titanic")
print("Head", df.head(), "Tail", df.tail())
print("Dimension Info.:", df.shape)  # (row count, column count)
# BUG FIX: df.info is a method; the original printed the bound-method object
# ("<bound method DataFrame.info ...>") instead of the summary. info() prints
# its report itself and returns None, so call it on its own line.
print("Info.")
df.info()
print("Index: ", df.index)
print("Columns", df.columns)
print(df.describe().T)  # summary statistics; .T transposes for a clearer output
print(df.isnull().values.any())  # is there at least one null anywhere? ******
# Are there any missing values in the data frame?
print(df.isnull().sum())  # per-column count of missing values
print(df["sex"].value_counts())
print(df["alive"].value_counts())
# Selection in Pandas****
import seaborn as sns

df = sns.load_dataset("titanic")
print(df.index)
print(df[0:13])  # row slice: rows 0..12
df.drop(0, axis=0).head()  # drop the row labeled 0 (axis=0 = rows); NOT in place
delete_index = [1, 2, 3, 7]
df.drop(delete_index, axis=0).head(10)  # drop several rows; still not permanent
# df = df.drop(delete_index, axis=0)           # permanent delete (reassignment)
# df.drop(delete_index, axis=0, inplace=True)  # permanent delete (in place)
# Convert Variable to Index***
df["age"].head()
df.age.head()  # attribute access selects the same column
print(df.index)
df.index = df["age"]  # use the age values as the row index
print(df.index)
print(df.drop("age", axis=1).head())  # drop the "age" column (axis=1); temporary
# BUG FIX: with inplace=True, drop() returns None, so wrapping it in print()
# only showed "None". Do the in-place delete without printing the result.
df.drop("age", axis=1, inplace=True)  # permanent delete from the DataFrame
# 1.) Convert Index to Variable***
# df["age"]  # would raise KeyError: the "age" column was deleted above
print(df)
df["age"] = df.index  # recreate an "age" column from the index values
print(df.head())
# 2.) Convert Index to Variable***
df.drop("age", axis=1, inplace=True)
print(df.reset_index().head())  # move the index back into a regular column
df = df.reset_index()
print(df.head())
# Operations with Variables(Columns)
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)  # show every column instead of "..."
df = sns.load_dataset("titanic")
print("age" in df)  # membership test checks the column names
print(type(df["age"].head()))  # single brackets -> pandas.core.series.Series
# ***!!! [[]] keeps your data as a DataFrame***
print(type(df[["age"]].head()))  # pandas.core.frame.DataFrame
print(df[["age", "alive"]])
col_names = ["age", "alive"]
print(df[col_names])  # a list of names selects the same way as [[...]]
df["age2"] = df["age"] ** 2  # derive and attach a new column
print(df.head())
df["age3"] = df["age"] / df["age2"]
print(df)
print(df.drop("age3", axis=1).head())  # drop the age3 column (not permanent)
print(df.drop(col_names, axis=1).head())  # drop every column named in col_names
# loc with a boolean mask over column names:
age_mask = df.columns.str.contains("age")
print(df.loc[:, age_mask].head())   # keep columns whose name contains "age"
print(df.loc[:, ~age_mask].head())  # ~ negates the mask: names without "age"
# iloc (integer-position selection) & loc (label-based selection)***
print(df.iloc[0:3])  # rows 0, 1, 2 (end exclusive)
print(df.loc[0:3])   # rows 0, 1, 2, 3 (label slice, end inclusive)
col_names = ["age", "embarked", "alive"]
print(df.loc[0:3, col_names])  # rows 0..3, three named columns (str + float)**
# Conditional Selection
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)  # show every column instead of "..."
df = sns.load_dataset("titanic")
print(df.head())
over_50 = df["age"] > 50  # boolean mask: one True/False per row
print(df[over_50].head())  # passengers older than 50
print(df[over_50]["age"].count())  # how many passengers are older than 50
print(df.loc[over_50, "class"].head())  # rows by mask, one column by label
print(df.loc[over_50, ["age", "class"]].head())
# !!!! With more than one condition, wrap every condition in parentheses***
is_male = df["sex"] == "male"
print(df.loc[over_50 & is_male, ["age", "class"]].head())
from_cherbourg = df["embark_town"] == "Cherbourg"
print(df.loc[over_50 & is_male & from_cherbourg,
             ["age", "class", "embark_town"]].head())  # 3 conditions
from_southampton = df["embark_town"] == "Southampton"
print(df.loc[over_50 & is_male & (from_cherbourg | from_southampton),
             ["age", "class", "embark_town"]].head())  # 3 conditions
print(df["embark_town"].value_counts())
# Aggregation & Grouping
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)  # show every column instead of "..."
df = sns.load_dataset("titanic")
by_sex = df.groupby("sex")  # groupby objects are reusable
print(by_sex["age"].mean())  # mean age per sex
print(by_sex.agg({"age": "mean"}))  # same result expressed via agg
print(by_sex.agg({"age": ["mean", "sum", "max"]}))  # several stats per column
print(by_sex.agg({"age": ["mean", "sum"], "survived": "mean"}))
# Grouping by several keys yields a multi-level (hierarchical) result:
print(df.groupby(["sex", "embark_town"]).agg({"age": ["mean"], "survived": "mean"}))
print(df.groupby(["sex", "embark_town", "class"]).agg({"age": ["mean"], "survived": "mean"}))
print(df.groupby(["sex", "embark_town", "class"]).agg({
    "age": ["mean"],
    "survived": "mean",
    "sex": "count"}))
# Pivot Table
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)  # show every column instead of "..."
pd.set_option('display.width', 500)  # widen output so tables print on one line ***
df = sns.load_dataset("titanic")
# pivot_table(values, row-index, columns) - the default aggregation is the mean***
print(df.pivot_table("survived", "sex", "embarked"))
print(df.pivot_table("survived", "sex", "embarked", aggfunc="std"))  # Standard Deviation
print(df.pivot_table("survived", "sex", ["embarked", "class"]))  # 2-level columns
# !!! cut and qcut are the usual tools for turning numeric variables into
# categorical ones. Use cut when you already know the bin edges you want
# (e.g. age: 0-3 baby, 3-4 toddler, 18 adult); use qcut when you don't.***
age_edges = [0, 10, 18, 25, 40, 90]
df["new_age"] = pd.cut(df["age"], age_edges)
print(df.head())
print(df.pivot_table("survived", "sex", "new_age"))
print(df.pivot_table("survived", "sex", ["new_age", "class"]))
# apply (run a function over rows or columns) & lambda (small anonymous functions)
# A lambda can take any number of arguments but holds only one expression:
#     lambda arguments: expression
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)  # show every column instead of "..."
pd.set_option('display.width', 500)  # widen output so tables print on one line ***
df = sns.load_dataset("titanic")
df["age2"] = df["age"] * 2
df["age3"] = df["age"] * 5
# ### EX.: Divide by 10 all values in every column whose name contains "age"
# 1.) One column at a time
for name in ("age", "age2", "age3"):
    print((df[name] / 10).head())
# 2.) Loop over matching columns --> Permanent
for col in df.columns:
    if "age" in col:
        df[col] = df[col] / 10
print(df.head())
# 3.) apply & lambda ---> use a lambda when a throwaway function is enough
print(df[["age", "age2", "age3"]].apply(lambda column: column / 10).head())
age_mask = df.columns.str.contains("age")
print(df.loc[:, age_mask].apply(lambda column: column / 10).head())  # Programmatic Way
# Ex: standardize every "age" column: subtract the mean, then divide by the std
print(df.loc[:, age_mask].apply(lambda column: (column - column.mean()) / column.std()).head())  # 1.
def standard_scaler(col_name):
    """Standardize a numeric Series to z-scores: (x - mean) / std."""
    centered = col_name - col_name.mean()
    return centered / col_name.std()
print(df.loc[:, df.columns.str.contains("age")].apply(standard_scaler).head())  # 2. apply also accepts a named function
# Permanent ways to write the scaled values back into the DataFrame:
scaled = df.loc[:, df.columns.str.contains("age")].apply(standard_scaler)
df.loc[:, ["age", "age2", "age3"]] = scaled
df.loc[:, df.columns.str.contains("age")] = df.loc[:, df.columns.str.contains("age")].apply(standard_scaler)
# Join Operations
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  # show every column instead of "..."
pd.set_option('display.width', 500)  # widen output so tables print on one line ***
# BUG FIX: this section previously re-loaded the titanic data with
# sns.load_dataset("titanic"), but seaborn is not imported in this section
# (it only ran by accident of an earlier import) and the frame was never
# used afterwards. The dead statement is removed.
m = np.random.randint(1, 30, size=(5, 3))  # 5x3 array of random ints in [1, 30)
df1 = pd.DataFrame(m, columns=["var1", "var2", "var3"])  # DataFrame from an array: data + column names
df2 = df1 + 99
# print(df1, "\n", df2)
print(pd.concat([df1, df2]))  # stack vertically; each frame keeps its own index
print(pd.concat([df1, df2], ignore_index=True))  # renumber the combined index
df1 = pd.DataFrame({'employees': ['john', 'dennis', 'mark', 'maria'],
                    'group': ['accounting', 'engineering', 'engineering', 'hr']})
df2 = pd.DataFrame({'employees': ['mark', 'john', 'dennis', 'maria'],
                    'start_date': [2010, 2009, 2014, 2019]})
print(pd.merge(df1, df2))  # merges on the shared "employees" column by default
print(pd.merge(df1, df2, on="employees"))  # same merge with the key spelled out
# EX: Access every employee's manager by joining through the group column
df3 = pd.merge(df1, df2)
df4 = pd.DataFrame({'group': ['accounting', 'engineering', 'hr'],
                    'manager': ['Tom', 'Rick', 'William']})
print(pd.merge(df3, df4))