# Copyright 2018-2019 Stanislav Pidhorskyi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
try:
    from Queue import Queue, Empty  # Python 2
except ImportError:
    from queue import Queue, Empty  # Python 3

from threading import Thread, Lock, Event

from progress_bar import ProgressBar


def batch_provider(data, batch_size, processor=None, worker_count=1, queue_size=16, report_progress=True):
    """Return an object that produces a sequence of batches from input data.

    Input data is split into batches of size :attr:`batch_size`, which are processed with the function
    :attr:`processor`. Data is split and processed by separate threads and pushed into a queue, allowing
    continuous provision of data. The main purpose of this primitive is to provide an easy-to-use tool for
    parallel batch processing/generation in the background while the main thread runs the main algorithm.

    Batches are processed in parallel, allowing better utilization of CPU cores and disk, which may improve
    GPU utilization for DL tasks with a storage/IO bottleneck.

    This primitive can be used in various ways. For small datasets, the input :attr:`data` list may contain the
    actual dataset, while the :attr:`processor` function does little to no data processing. For larger datasets,
    the :attr:`data` list may contain just filenames or keys, while the :attr:`processor` function reads the data
    from disk or a database.

    The :attr:`processor` function can serve many purposes, depending on your use case:

    - Reading data from disk or a database.
    - Decoding data, e.g. from JPEG.
    - Augmenting data: flipping, rotating, adding noise, etc.
    - Concatenating data, stacking it into a single ndarray, converting it to a tensor, uploading it to the GPU.
    - Generating data.

    Note:
        Sequential order of batches is guaranteed only if the number of workers is 1 (the default); otherwise
        batches may be supplied out of order.

    Args:
        data (list): Input data; each entry in the list should be a separate data point.
        batch_size (int): Size of a batch. If the size of the data is not divisible by :attr:`batch_size`,
            the last batch will be smaller.
        processor (Callable[[list], Any], optional): Function for processing batches. Receives a slice of the
            :attr:`data` list as input and may return an object of any type. Defaults to None.
        worker_count (int, optional): Number of workers; should be greater than or equal to one. To process
            data in parallel and fully load the CPU, :attr:`worker_count` should be close to the number of CPU
            cores. Defaults to one.
        queue_size (int, optional): Maximum size of the queue, i.e. the number of batches to buffer. Should be
            larger than :attr:`worker_count`. Typically, one would want this to be as large as possible to
            amortize disk IO and computational costs; the downside of a large value is increased RAM
            consumption. Defaults to 16.
        report_progress (bool, optional): Print a progress bar similar to `tqdm`. You may still use `tqdm` if
            you set :attr:`report_progress` to False. To use `tqdm`, just do
            ::

                for x in tqdm(batch_provider(...)):
                    ...

            Defaults to True.

    Returns:
        Iterator: An object that produces a sequence of batches. The :meth:`next()` method of the iterator
        returns the object produced by the :attr:`processor` function.

    Raises:
        StopIteration: When all data has been iterated through. Stops the for loop.

    Example:
        ::

            def process(batch):
                images = [misc.imread(x[0]) for x in batch]
                images = np.asarray(images, dtype=np.float32)
                images = images.transpose((0, 3, 1, 2))
                labels = [x[1] for x in batch]
                labels = np.asarray(labels, np.int64)
                return torch.from_numpy(images) / 255.0, torch.from_numpy(labels)

            data = [('some_list.jpg', 1), ('of_filenames.jpg', 2), ('etc.jpg', 4), ...]  # filenames and labels

            batches = dlutils.batch_provider(data, 32, process)

            for images, labels in batches:
                result = model(images)
                loss = F.nll_loss(result, labels)
                loss.backward()
                optimizer.step()
    """
    class State:
        """Shared state: batch counter, result queue, and synchronization primitives."""
        def __init__(self):
            self.current_batch = 0
            self.lock = Lock()
            self.data_len = len(data)
            # Round up so the final, smaller batch is counted as well.
            self.batch_count = self.data_len // batch_size + (1 if self.data_len % batch_size != 0 else 0)
            self.quit_event = Event()
            self.queue = Queue(queue_size)
            self.batches_done_count = 0
            self.progress_bar = None
            if report_progress:
                self.progress_bar = ProgressBar(self.batch_count)

        def get_next_batch_it(self):
            # Atomically claim the next batch index; raises StopIteration once
            # all batches are claimed or shutdown was requested.
            with self.lock:
                if self.quit_event.is_set() or self.current_batch == self.batch_count:
                    raise StopIteration
                cb = self.current_batch
                self.current_batch += 1
                return cb

        def push_done_batch(self, batch):
            # Enqueue the batch and bump the counter atomically, so `all_done`
            # never observes a completed count without the corresponding item.
            with self.lock:
                self.queue.put(batch)
                self.batches_done_count += 1

        def all_done(self):
            return self.batches_done_count == self.batch_count and self.queue.empty()
    if processor is None:
        # Identity processor: batches are returned as raw slices of `data`.
        def processor(x):
            return x

    def _worker(state):
        # Each worker repeatedly claims the next batch index, processes the
        # corresponding slice of `data`, and publishes the result until the
        # data is exhausted or the iterator shuts down.
        while not state.quit_event.is_set():
            try:
                cb = state.get_next_batch_it()
            except StopIteration:
                break
            data_slice = data[cb * batch_size:min((cb + 1) * batch_size, state.data_len)]
            state.push_done_batch(processor(data_slice))
    class Iterator:
        def __init__(self):
            self.state = State()
            self.workers = []
            for _ in range(worker_count):
                worker = Thread(target=_worker, args=(self.state,))
                worker.daemon = True
                worker.start()
                self.workers.append(worker)

        def __len__(self):
            return self.state.batch_count

        def __iter__(self):
            return self

        def __next__(self):
            if not self.state.quit_event.is_set() and not self.state.all_done():
                item = self.state.queue.get()
                self.state.queue.task_done()
                if self.state.progress_bar is not None:
                    self.state.progress_bar.increment()
                return item
            self.state.quit_event.set()
            raise StopIteration

        next = __next__  # Python 2 compatibility

        def __del__(self):
            # Signal workers to stop and drain the queue so that no worker
            # stays blocked on a full queue, then wait for workers to exit.
            self.state.quit_event.set()
            while not self.state.queue.empty():
                try:
                    self.state.queue.get(False)
                    self.state.queue.task_done()
                except Empty:
                    break
            for worker in self.workers:
                worker.join()

    return Iterator()
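

# A minimal usage sketch (not part of the original module, runs only when this
# file is executed directly): it exercises `batch_provider` with a hypothetical
# processor that sums each batch. With `worker_count=1` the batch order is
# deterministic, so the results can be checked against a serial computation.
# `report_progress=False` keeps the demo independent of the ProgressBar import.
if __name__ == "__main__":
    numbers = list(range(100))

    def summed(batch):
        # Stand-in for real work such as disk reads or image decoding.
        return sum(batch)

    batches = batch_provider(numbers, batch_size=8, processor=summed,
                             worker_count=1, report_progress=False)
    totals = list(batches)
    assert len(totals) == 13              # ceil(100 / 8): 12 full batches + 1 of size 4
    assert sum(totals) == sum(numbers)    # no data lost or duplicated
    print("batch sums:", totals)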