-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexternal_sort.py
More file actions
104 lines (80 loc) · 2.81 KB
/
external_sort.py
File metadata and controls
104 lines (80 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import numpy as np
import shutil
import os
from tqdm import tqdm
# buffer size is the max number your memory can handle, set a small number in the beginning for trial here
buffer_size = 1000000
# total size of the number that needs to be sorted, only needed if you want to generate a file of unsorted integers
total_size = 55000
def save_array_to_file(file_name, array_to_save):
np.savetxt(file_name, array_to_save, fmt = '%d')
def sort_and_write(file_name, array_to_sort):
array_to_sort.sort()
save_array_to_file(file_name, array_to_sort)
def read_n_int(file_, numbers_to_read):
array_ = []
if numbers_to_read <= 0 :
return array_
num = file_.readline()
try:
int(num)
except ValueError:
num = file_.readline()
while(num):
array_.append(int(num))
if len(array_) >= numbers_to_read:
break
num = file_.readline()
return array_
def create_unsorted_file(size_, file_name_ = 'unsorted.csv'):
arr = np.arange(size_)
np.random.shuffle(arr)
save_array_to_file(file_name_, arr)
arr = None
import heapq
def sort_slices(file_name, buffer_size_, id_count):
read_arr = []
chunk = 1
f = open(file_name)
if os.path.exists('./tmp/'):
shutil.rmtree('./tmp/')
os.mkdir('./tmp/')
print("Sorting file (Step 1)")
with tqdm(total=id_count+1) as bar:
read_arr = read_n_int(f, buffer_size_)
while (len(read_arr) > 0):
sort_and_write('./tmp/sorted_' + str(chunk), read_arr)
read_arr = read_n_int(f, buffer_size_)
chunk = chunk + 1
bar.update(buffer_size_)
f.close()
def min_heap_sort(output_file, id_count):
sorted_file = open(output_file, 'w+')
min_heap = []
heapq.heapify(min_heap)
print('Sorting file (Step 2)')
open_files = []
for f in tqdm(os.listdir('./tmp/')):
if os.path.isfile('./tmp/' + f):
file_ = open('./tmp/' + f)
open_files.append(file_)
val = file_.readline()
heapq.heappush(min_heap, (int(val), file_))
print('Sorting file (Step 3)')
with tqdm(total=id_count) as bar:
while(len(min_heap) > 0):
min_element = heapq.heappop(min_heap)
sorted_file.write(str(min_element[0]) + '\n')
next_str = min_element[1].readline()
if next_str:
heapq.heappush(min_heap, (int(next_str), min_element[1]))
else:
min_element[1].close()
bar.update(1)
for file in os.listdir('./tmp/'):
os.remove('./tmp/' + file)
os.rmdir('./tmp/')
sorted_file.close()
def external_sort(input_file, output_file, id_count, buffer_size_=buffer_size):
sort_slices(input_file, buffer_size_, id_count)
min_heap_sort(output_file, id_count)