data8/Data8 syntax.py at master · gituxedo/data8 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Common Python operators
1 + 2  # 3		Addition
1 - 2  # -1		Subtraction
1 * 2  # 2		Multiplication
1 / 2  # 0.5	Division
1 % 2  # 1		Modulo
1 ** 2 # 1		Exponentiate
# `/` is true division (always returns a float), not integer division; use `//` for floor division
# Operators follow PEMDAS precedence

# Assignments
a = 10
b = 2 * a
a + b # 30

# Import math for fun things
# https://docs.python.org/3/library/math.html
import math
# operator exposes the arithmetic operators as functions: operator.add(4, 5) is 4 + 5
# https://docs.python.org/3/library/operator.html
import operator
math.log(2, 16) 				# 0.25	log of 2 in base 16
math.sqrt(operator.add(4, 5)) 	# 3.0	square root of (4 + 5)

# Types
# int, float, str (string), bool (boolean)
# Common Python comparisons
2 < 3	# True		less than
3 > 2	# True		greater than
2 <= 2	# True		less than or equal
3 >= 3	# True		greater than or equal
3 == 3	# True		equal
3 != 2	# True		not equal

# Arrays (numpy), sequences, lists
import numpy as np
english_parts_of_speech = make_array("noun", "pronoun", "verb", "adverb", "adjective", "conjunction", "preposition", "interjection")
# https://www.inferentialthinking.com/chapters/05/1/Arrays
# entrywise operations: can just 'add' or 'multiply' normally
# diff, sum, sort most useful
# np.arange(start, end, step): values from start (inclusive) up to end (exclusive), spaced by step
np.arange(1.5, -2, -0.5)
# instantiate numpy array. Can convert table rows to arrays
np.array(row)

# Tables
from datascience import *
Table() 			# blank table
.with_columns("Column title", make_array("array", "of", "stuff"),
			  "Next column", make_array(420, 23547648, 343))
# read from existing csv
.read_table(path_data + 'table_title.csv')
.show(n) 		# shows first n rows, PURELY cosmetic
.num_columns
.num_rows
.labels 		# list of column labels
.column('Column title')	# retrieves column of this name
.column(3)				# or index, starting at 0
.column(0).item(4)		# retrieves entry in index 4 of column 0
# format can be DateFormatter, CurrencyFormatter, PercentFormatter
.set_format('New Name', PercentFormatter)

# makes new table with selected columns
# DIFFERENT FROM .column
.select('col1 name', 'col2 name', etc)
.select(start, end index)
# makes new table with some columns dropped
.drop('col1 name', 'col2 name', etc)
# sorts table
.sort('column name', descending=False, distinct=False)
# makes new table with specified range of row(s)
.take(n) # takes row of index n
.take(np.arange(4, 19))
# applies a function to each element of a column
.apply(func_name, "column name"[, "column2 name", etc])

# CONDITIONAL SELECTION: *WHERE*
.where('column name', are.equal_to(x))
						 .above(x)
						 .above_or_equal_to(x)
						 .below(x)
						 .below_or_equal_to(x)
						 .between(x, y)
						 .strictly_between(x, y)
						 .between_or_equal_to(x, y)
						 .containing(S)
# Can negate any of these, e.g.
						 .not_above(x)

# Visualizations
sample = Table().read_table(path_data + 'sample.csv')
# scatter plot, table can be modified before plotting
sample.scatter('x axis label', 'y axis label')
# line graph
sample.plot('x axis label', 'y axis label')
# bar chart (horizontal)
sample.barh('category axis label', 'quantity label')
# histogram (vertical)
sample.hist('column'[, bins=np.arange(start, end, size), unit='axes unit'])
# histogram: makes table with count in each bin
bins = sample.bin('column', bins=np.arange(start, end, size))
# histogram: construct from bin table
sample.hist('x axis label', bin_column='bins', unit='Million Dollars')

# vertical axis uses DENSITY SCALE: bar height = %entries relative to bin size
# https://www.inferentialthinking.com/chapters/07/2/Visualizing_Numerical_Distributions#the-histogram-general-principles-and-calculation
# area of bar = % entries in bin
# 			  = bar height * bin width
# bar height = bar area/bin width = % entries in bin/bin width

# can use unequal bins to construct histogram
uneven_bins = make_array(50, 100, 200, 420, 500, 1337)

# overlay multiple graphs of the same type (for scatter, plot, barh)
# edit table first! plots all other columns against 'col label'
sample.barh('col label')


# Functions
def func_name(arg1, arg2):
	"""Return the result of arg1 + arg2."""
	# Example of a two-argument function: combine the inputs with `+`.
	result = arg1 + arg2
	return result

# Aggregations
# groups rows by the distinct values in 'col1 name' and aggregates the other columns with groupBy_function
# 	-default (no function) is count
# 	-with the default, returns 2-column table: | 'col1 name' | count |
.group('col1 name'[, groupBy_function])
# creates new table where sample is augmented by table2 column data
.join('col1 for joining', table2, 'table2col for joining')
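# cross-classifies rows by two columns: values of the first become column labels, values of the second become row labels
# 	-cells hold counts by default, or collect applied to the 'values' column when both are given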
.pivot('col labels', 'row labels'[, values='count column', collect=groupBy_function])

# Random variables and probabilities
# returns a random entry of an nparray; with number_of_trials, returns an array of that many random draws (with replacement by default)
np.random.choice(nparray[, number_of_trials])
# (non-mutative) appends value to array, returns new array
np.append(array, value)
# returns # of nonzero (True) elements in an array
np.count_nonzero(array)
# returns item at index i in an array
array.item(i)
# returns corresponding percentile value of an array
percentile(percentile, array)

# Sampling methods
# random sample of rows from a table; default sample size is the number of rows, drawn WITH replacement unless with_replacement=False
.sample(sample_size[, with_replacement=True])
# draws sample_size times from a categorical distribution (array of probabilities summing to 1) and returns the array of sampled proportions per category
sample_proportions(sample_size, distribution_array)

# Miscellaneous
# returns array of all 1's (floats)
np.ones((size_tuple)[, dtype=float])
# returns array of all 0's (floats)
np.zeros((size_tuple)[, dtype=float])
# returns mean of array
np.mean(array)
# returns standard deviation of array
np.std(array)
# area under the standard normal curve (mean=0, std=1) to the left of z_score
from scipy import stats
stats.norm.cdf(z_score)