-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlearn_generators.py
More file actions
295 lines (227 loc) · 10.4 KB
/
learn_generators.py
File metadata and controls
295 lines (227 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import random
from timeit import timeit
# Comment out the `from memory_profiler import profile` import (and the
# @profile decorators below) when running time-based memory profiling with mprof.
# See the memory_profiler documentation.
from memory_profiler import profile
from glob import iglob
import os
import re
from pathlib import Path
import pprint
from collections import defaultdict
def random_iterator(limit):
    """Yield a strictly non-decreasing random walk of floats until limit is passed.

    Each step adds random.random() (in [0, 1)) to the running total, so the
    final yielded value is the first one >= limit (and < limit + 1).
    """
    total = 0
    while total < limit:
        total += random.random()
        yield total
# # a is a generator object (i.e. an iterable)
# a = random_iterator(10)
#
# # You have to iterate over the iterable to get the values
# print([i for i in a])
def int_gen():
    """Infinite generator of the positive integers 1, 2, 3, ..."""
    counter = 1
    while True:
        # Execution resumes here on each next(); the increment happens
        # after the previous value has been delivered.
        yield counter
        counter += 1
def get(n, seq):
    """Return a list of at most the first n values from the iterator seq.

    If seq is exhausted before n values are produced, the values collected
    so far are returned.  (The original built the list in a comprehension:
    when StopIteration fired mid-comprehension, `result` was never bound and
    the `return` raised UnboundLocalError, discarding the partial result.)
    """
    result = []
    try:
        for _ in range(n):
            result.append(next(seq))
    except StopIteration:
        pass  # seq ran out early; return what we have
    return result
@profile
def pythagorean_triplets():
    """Return the first 100 Pythagorean triplets (x, y, z) with x < y < z.

    Builds a lazy pipeline over the infinite integer generator; get() pulls
    exactly 100 results, driving the search only as far as needed.
    """
    def triplets():
        for z in int_gen():
            for y in range(1, z):
                for x in range(1, y):
                    if x * x + y * y == z * z:
                        yield (x, y, z)

    return get(100, triplets())
def read_files(filenames):
    """Lazily yield every line from every readable file in filenames.

    Each yielded line is prefixed with the file's path (the local software
    root abbreviated to '.') and a 0-based line number, e.g.
    './proj/a.py L3: <line contents>'.  Paths containing 'build' or 'old'
    are skipped; unreadable files are reported to stdout and skipped.
    """
    for filename in filenames:
        try:
            with open(filename) as fp:
                # Skip generated / stale trees.
                if 'build' in filename or 'old' in filename:
                    continue
                # enumerate replaces the hand-rolled counter; same 0-based numbering.
                for line_num, line in enumerate(fp):
                    yield filename.replace('/Users/sthitid/software', '.') + ' L' + str(line_num) + ': ' + line
        except IOError as err:
            # Original message printed a "(unknown)" placeholder; include the
            # actual filename so failures are actionable.
            print(f"Can't open {filename}. Reason: {err}")
def find_pattern(pattern, lines):
    """Lazily filter lines down to those matching pattern (case-insensitive regex)."""
    return (candidate for candidate in lines
            if re.search(pattern, candidate, re.IGNORECASE))
def find_lines_of_len(length, lines):
    """Return the lines strictly longer than length characters (eager list)."""
    return [candidate for candidate in lines if len(candidate) > length]
# Module-level accumulators used by printlines(): map a username to the
# TODO / FIXME lines attributed to that user.  They persist across calls,
# so repeated printlines() invocations keep accumulating.
names_todos = defaultdict(list)
names_fixmes = defaultdict(list)
def printlines(lines):
    """Print each line and tally TODO / FIXME lines per known username.

    Usernames are parsed from the first (...) group on each line (comma
    separated).  Matching lines accumulate into the module-level
    names_todos / names_fixmes dicts; after the input is exhausted a
    per-user summary of both dicts is printed.
    """
    known_users = {'calvinq', 'zhuoc', 'tanl', 'jayw', 'boweny', 'chrisa',
                   'katiap', 'juny', 'qinghual', 'sthitid', 'venkats',
                   'vineeths', 'tejasn', 'jianz'}
    for line in lines:
        print(line, end="")
        owners = line[line.find('(') + 1: line.find(')')].split(',')
        for owner in owners:
            if owner not in known_users:
                continue
            if re.search("TODO", line, re.IGNORECASE):
                names_todos[owner].append(line)
            if re.search("FIXME", line, re.IGNORECASE):
                names_fixmes[owner].append(line)
    for owner, task_list in names_todos.items():
        print(f'{owner} = {len(task_list)} | {task_list}')
    for owner, task_list in names_fixmes.items():
        print(f'{owner} = {len(task_list)} | {task_list}')
def printdict(line):
    """Extract the name inside the first (...) of line, map it to the line, and print the dict."""
    tasks = defaultdict(list)
    owner = line[line.find('(') + 1: line.find(')')]
    tasks[owner].append(line)
    print(tasks)
def find_pattern_in_files(pattern, filenames):
    """Print every line in filenames matching pattern (case-insensitive).

    Wires the read_files -> find_pattern -> printlines generator pipeline;
    printlines drives the whole pipeline lazily.
    """
    printlines(find_pattern(pattern, read_files(filenames)))
def find_lines_of_len_in_files(length, filenames):
    """Print every line in filenames longer than length characters.

    NOTE: the first parameter was named `len`, shadowing the builtin; renamed
    to `length`.  All calls in this file pass it positionally, so callers are
    unaffected.
    """
    lines = read_files(filenames)
    lines = find_lines_of_len(length, lines)
    printlines(lines)
def gen_paths_of_files_in(dir):
    """Lazily yield the absolute path of every file under dir, recursively.

    Nothing is produced until the caller iterates; os.walk itself is lazy.
    """
    for dir_name, _subdirs, file_names in os.walk(dir):
        for fname in file_names:
            # os.path.join is separator-correct on every platform; the
            # previous '/'.join was POSIX-only.
            yield os.path.abspath(os.path.join(dir_name, fname))
def print_file_names(filenames):
    """Print each filename, tab-indented, one per line.

    (A generator expression passed to print would print the generator
    object itself, not the names — hence the explicit loop.)
    """
    for name in filenames:
        print('\t%s' % name)
@profile
def find_paths_of_files_in(dir):
    """Materialize (and discard) every file path under dir, recursively.

    Exists to drive the gen_paths_of_files_in() pipeline end to end so that
    memory_profiler's @profile can measure the generator's footprint; the
    list built by the comprehension is intentionally thrown away.
    """
    filenames = gen_paths_of_files_in(dir)
    # Consuming the generator forces every path to be produced.
    [filename for filename in filenames]
def gen_python_files(filenames):
    """Yield only the names from filenames whose extension is exactly '.py'.

    Uses os.path.splitext rather than endswith, so a bare '.py' filename
    (whole name is the extensionless stem) is NOT yielded.
    """
    for candidate in filenames:
        _stem, extension = os.path.splitext(candidate)
        if extension != ".py":
            continue
        yield candidate
def print_python_files(pyfiles):
    """Print each python file path on its own line."""
    for path in pyfiles:
        print(path)
def find_python_files_in(dir):
    """Print the path of every .py file under dir, recursively (lazy pipeline)."""
    print_python_files(gen_python_files(gen_paths_of_files_in(dir)))
def gen_loc_in_python_files(pyfiles):
    """Yield 1 for every line of every readable file in pyfiles.

    sum() over this generator gives the total line count; unreadable
    files are reported to stdout and skipped.
    """
    for path in pyfiles:
        try:
            with open(path, 'r') as handle:
                yield from (1 for _ in handle)
        except IOError:
            print(path + " not found")
def get_loc_in_python_files_in(dir):
    """Print the total line count across all .py files under dir, recursively."""
    py_paths = gen_python_files(gen_paths_of_files_in(dir))
    print(sum(gen_loc_in_python_files(py_paths)))
def gen_loc_without_comments_or_empty_lines_in_python_files(pyfiles):
    """Yield 1 for every significant line (not blank, not a comment) in pyfiles.

    sum() over this generator gives the LOC count.  Unreadable files are
    reported to stdout and skipped.
    """
    for pyfile in pyfiles:
        try:
            with open(pyfile, 'r') as fp:
                for line in fp:
                    # Lines read from a file keep their '\n' and are therefore
                    # always truthy, so the original `if line:` never filtered
                    # blanks — strip first.
                    stripped = line.strip()
                    if not stripped:
                        continue  # blank line
                    if stripped.startswith('#'):
                        continue  # comment line (indented comments included)
                    yield 1
        except IOError:
            print(pyfile + " not found")
def get_loc_without_comments_or_empty_lines_in_python_files_in(dir):
    """Print the LOC count (blank and comment lines excluded) of all .py files under dir."""
    py_paths = gen_python_files(gen_paths_of_files_in(dir))
    print(sum(gen_loc_without_comments_or_empty_lines_in_python_files(py_paths)))
def get_file_name(fp, n):
    """Yield a derived file name of the form '<fp.name>_<i // n>'.

    NOTE(review): this helper looks unfinished — `i` starts at 0 and is never
    advanced, so at most one name ('<fp.name>_0') is produced (none when
    n <= 0).  The original concatenated a str with an int
    (`fp.name + "_" + i // n`), which raised TypeError on first use; fixed
    with an f-string while preserving the rest of the (dead) logic.
    """
    i = 0
    if i < n:
        yield f"{fp.name}_{i // n}"
def write_n_lines(filename, n_lines):
    """Write the lines in n_lines to '<stem>_<counter><ext>' and bump the counter.

    The chunk counter lives in the function attribute write_n_lines.i,
    which the caller must initialize (see split_into_n_files) before the
    first call; it is incremented after each successful write.
    """
    stem, extension = os.path.splitext(filename)
    target = f"{stem}_{write_n_lines.i}{extension}"
    with open(target, 'w') as out:
        out.writelines(n_lines)
    write_n_lines.i += 1
def split_into_n_files(file, n):
    """Split file into multiple output files of (up to) n lines each.

    Chunks are written by write_n_lines(), which names them
    '<stem>_<k><ext>' using the chunk counter write_n_lines.i (reset to 0
    here).  EOF is detected by probing one line ahead with readline() and
    rewinding with seek() so no data is lost to the probe.
    """
    write_n_lines.i = 0
    with open(file, 'r') as read_fp:
        #while read_fp: # This becomes an infinite loop in python because read_fp has the
                        # byte position it is pointing to, and not the value, so no EOF detection.
                        # The correct check is while read_fp.read() or read_fp.readline()
                        # But when you do this, the current line or all of the next bytes
                        # get eaten in the check, so use tell and seek to set the file pointer
                        # in the right location
        try:
            # Probe one line ahead to detect EOF, then rewind so the chunk
            # generator below re-reads that line.
            curr_pos = read_fp.tell()
            data = read_fp.readline()
            while data:
                read_fp.seek(curr_pos)
                print(f"Start: {read_fp.tell()}")
                # Lazy generator of up to n lines; it is consumed inside
                # write_n_lines, advancing read_fp as a side effect.
                n_lines = (read_fp.readline() for _ in range(n))
                write_n_lines(file, n_lines)
                print(f"End: {read_fp.tell()}")
                # Probe again for the next iteration's EOF check.
                curr_pos = read_fp.tell()
                data = read_fp.readline()
        except IOError:
            print("Oops!")
def delete_file_gen(pattern):
    """Delete every file matching the glob pattern, printing each path.

    iglob returns a lazy iterator, so millions of matches can be removed
    without materializing the full list.
    """
    for match in iglob(pattern):
        if not os.path.exists(match):
            continue
        print(f"Deleting {match} ...")
        os.remove(match)
if __name__ == '__main__':
    # Driver: each "Anandology Problem" below was solved with the generator
    # pipelines defined above; uncomment a call to run that experiment.
    # print(f"Pythagorean triplets: {timeit(pythagorean_triplets, number=10)} seconds")
    # print(pythagorean_triplets())
    # Anandology Problem 1: takes a list of filenames as arguments and prints
    # only the line which has a particular substring
    # find_pattern_in_files("dshim", ["./data/DDRStat.out", "./data/ddr_stat_working.out"])
    # Find TODOs / FIXMEs in the codebase (the only experiment currently enabled).
    for pattern in ["TODO", "FIXME"]:
        print(f"\n\nPending {pattern}\n\n")
        find_pattern_in_files(pattern,
                              iglob("/Users/sthitid/software/sambaflow/**/*.py", recursive = True))
    # Anandology Problem 2: Write a program that takes one or more filenames as arguments and prints
    # all the lines which are longer than 40 characters.
    # find_lines_of_len_in_files(40, iglob("/Users/sthitid/Desktop/*.txt", recursive=False))
    # Anandology Problem 3: Write a function findfiles that recursively descends the directory tree for
    # the specified directory and generates paths of all the files in the tree.
    # find_paths_of_files_in('/Users/sthitid/Desktop')
    # Anandology Problem 4: Write a function to compute the number of python files (.py extension)
    # in a specified directory recursively.
    # find_python_files_in('/Users/sthitid/Documents/my_python_projects')
    # Anandology Problem 5: Write a function to compute the total number of lines of code in
    # all python files in the specified directory recursively.
    # get_loc_in_python_files_in('/Users/sthitid/Documents/my_python_projects')
    # Anandology Problem 6: Write a function to compute the total number of lines of code, ignoring empty and
    # comment lines, in all python files in the specified directory recursively.
    # get_loc_without_comments_or_empty_lines_in_python_files_in('/Users/sthitid/Documents/my_python_projects')
    # Anandology Problem 7: Write a program split.py, that takes an integer n and a filename as command
    # line arguments and splits the file into multiple small files with each having n lines.
    # split_into_n_files("/Users/sthitid/Documents/my_python_projects/data/DDRStat.out", 100)
    # LEARNING 1: The thing about generators is they are pipelines - set them up for a single element and the final
    # consumer (for loop or print) will drive the pipeline for all the elements in the dataset.
    # You can build data pipelines with multiple generators.
    # LEARNING 2: None of this understanding about generators prepared me for the task of deleting 1.8 million files
    # that got generated from a wrong solution to Problem 7 above. I tried bash scripts which I just couldn't get to
    # work, then I tried a python script with a generator, which I just couldn't iterate over, only to realize
    # that iglob returns a generator, and all I needed to do is to iterate over iglob (i in iglob stands for iterator!)
    # delete_file_gen("/Users/sthitid/Documents/my_python_projects/data/DDRStat_*.out")