davidbp
diff --git a/‎python_basics/multiprocessing/joblib/README.md‎
Lines changed: 67 additions & 0 deletions b/‎python_basics/multiprocessing/joblib/README.md‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/joblib/feature_counts_dict_joblib_parallel_and_reduce.py‎
Lines changed: 61 additions & 0 deletions b/‎python_basics/multiprocessing/joblib/feature_counts_dict_joblib_parallel_and_reduce.py‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/joblib/feature_counts_dict_joblib_parallel_and_reduce_in_chunks.py‎
Lines changed: 72 additions & 0 deletions b/‎python_basics/multiprocessing/joblib/feature_counts_dict_joblib_parallel_and_reduce_in_chunks.py‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/joblib/feature_counts_dict_serial.py‎
Lines changed: 34 additions & 0 deletions b/‎python_basics/multiprocessing/joblib/feature_counts_dict_serial.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/joblib/other_tests/feature_counts_dict_pool_map_and_reduce.py‎
Lines changed: 73 additions & 0 deletions b/‎python_basics/multiprocessing/joblib/other_tests/feature_counts_dict_pool_map_and_reduce.py‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/joblib/other_tests/feature_counts_dict_pool_map_with_manager.py‎
Lines changed: 61 additions & 0 deletions b/‎python_basics/multiprocessing/joblib/other_tests/feature_counts_dict_pool_map_with_manager.py‎
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,67 @@
+# Collection of parallelizable examples
+
+
+## Feature counts
+
+Example building a dict that contains word counts. This example showcases three different approaches, found in the following files:
+
+```
+feature_counts_dict_serial.py
+feature_counts_dict_joblib_parallel_and_reduce.py
+feature_counts_dict_joblib_parallel_and_reduce_in_chunks.py
+```
+
+
+#### Serial version (bottlenecked by the completely serial operation)
+```
+python feature_counts_dict_serial.py 
+```
+
+```
+num docs = 1131400
+
+time overall  117.3199 sec
+
+len(vocabulary.items())---> 130107
+(vocabulary['from'], vocabulary['gift'])---> (2267000, 6600)
+```
+
+
+#### Parallel version working one element at a time (bottlenecked by the reduce step)
+
+```
+python feature_counts_dict_joblib_parallel_and_reduce.py
+```
+
+```
+num docs = 1131400
+
+time build vocabularies  42.0592 sec
+time aggregate vocabularies  35.4708 sec
+time overall  77.5302 sec
+
+len(partial_vocabularies)---> 1131400
+len(vocabulary.items())---> 130107
+(vocabulary['from'], vocabulary['gift'])---> (2267000, 6600)
+```
+
+
+#### Parallel version working in minibatches 
+
+Note that this implementation is not bottlenecked by the parallel part, and the remaining bottleneck in the reduce step is negligible.
+
+```
+python feature_counts_dict_joblib_parallel_and_reduce_in_chunks.py
+```
+
+```
+num docs = 1131400
+
+time build vocabularies  26.7191 sec
+time aggregate vocabularies  0.236 sec
+time overall  26.9552 sec
+
+len(partial_vocabularies)---> 12
+len(vocabulary.items())---> 130107
+(vocabulary['from'], vocabulary['gift'])---> (2267000, 6600)
+```
@@ -0,0 +1,61 @@
+import string
+import random
+import time
+from collections import Counter
+from functools import partial, reduce
+from collections import defaultdict
+from itertools import repeat
+
+import sklearn
+from sklearn import feature_extraction, datasets
+from joblib import Parallel, delayed
+
+from utils import timer, load_data
+
+def aggregate_dicts(dicts):
+    """Sum a sequence of count dicts key by key into a single dict.
+
+    The counts are accumulated in place into ``dicts[0]``, which is also the
+    object that is returned; the remaining dicts are only read.
+
+    Args:
+        dicts: Sequence of mappings from key to numeric count.
+
+    Returns:
+        ``dicts[0]`` updated with the counts of every other dict, or an empty
+        ``defaultdict(int)`` when ``dicts`` is empty.
+    """
+    if len(dicts) == 0:
+        return defaultdict(int)
+
+    # A single-element input skips the loop and yields the dict itself
+    # (not the enclosing list), so callers always get a mapping back.
+    result = dicts[0]
+    for d in dicts[1:]:
+        for k, v in d.items():
+            # .get() keeps this working when the first dict is a plain dict.
+            result[k] = result.get(k, 0) + v
+
+    return result
+
+
+def build_vocabulary(sentence, doc_analyzer):
+    """Count the occurrences of each token of a single document.
+
+    Args:
+        sentence: Document passed as-is to ``doc_analyzer``.
+        doc_analyzer: Callable that turns ``sentence`` into an iterable of tokens.
+
+    Returns:
+        A ``defaultdict(int)`` mapping each token to its number of occurrences.
+    """
+    counts = defaultdict(int)
+    for token in doc_analyzer(sentence):
+        counts[token] = counts[token] + 1
+    return counts
+
+
+if __name__ == '__main__':
+    # Benchmark: build one partial vocabulary per document in parallel with
+    # joblib, then merge all the partial vocabularies serially.
+    n_jobs = 10
+    factor_multiplier = 100 # This factor ensures 1 million documents in the dataset
+
+    sentences, _, _, _ = load_data()
+    sentences = sentences * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    # Only the analyzer callable of the vectorizer is used; it is applied to
+    # every document inside build_vocabulary.
+    count_vectorizer = feature_extraction.text.CountVectorizer()
+    doc_analyzer = count_vectorizer.build_analyzer()
+
+    with timer('overall', indentation=''):
+
+        # Map step: one task, and one returned dict, per document.
+        with timer('build vocabularies', indentation=''):
+            p_build_vocabulary = partial(build_vocabulary,  doc_analyzer=doc_analyzer)
+            tasks = (delayed(p_build_vocabulary)(s) for s in sentences)
+            partial_vocabularies = Parallel(n_jobs=n_jobs)(tasks)
+
+        # Reduce step: merge every per-document dict in the parent process.
+        with timer('aggregate vocabularies', indentation=''):
+            vocabulary = aggregate_dicts(partial_vocabularies)
+
+    print('\nlen(partial_vocabularies)--->', len(partial_vocabularies))
+    print('len(vocabulary.items())--->', len(vocabulary.items()))
+    print("(vocabulary['from'], vocabulary['gift'])--->", (vocabulary['from'], vocabulary['gift']))
@@ -0,0 +1,72 @@
+import string
+import random
+import time
+from collections import Counter
+from functools import partial, reduce
+
+from collections import defaultdict
+from itertools import repeat
+
+import sklearn
+from sklearn import feature_extraction, datasets
+from joblib import Parallel, delayed
+
+from utils import timer, load_data
+
+def aggregate_dicts(dicts):
+    """Merge partial vocabularies by adding up their counts per key.
+
+    Accumulation happens in place in the first dict of the sequence, which is
+    the object handed back to the caller; the other dicts are left untouched.
+
+    Args:
+        dicts: Sequence of mappings from key to numeric count.
+
+    Returns:
+        The first dict, updated with the counts of all the others, or an
+        empty ``defaultdict(int)`` when the sequence is empty.
+    """
+    if len(dicts) == 0:
+        return defaultdict(int)
+
+    # With exactly one dict the loop below does nothing and that dict is
+    # returned directly, never the list wrapping it.
+    result = dicts[0]
+    for d in dicts[1:]:
+        for k, v in d.items():
+            # .get() avoids a KeyError when the first dict is not a defaultdict.
+            result[k] = result.get(k, 0) + v
+
+    return result
+
+def build_vocabulary(sentences, doc_analyzer):
+    """Count token occurrences over a whole batch of documents.
+
+    Args:
+        sentences: Iterable of documents; each one is passed to ``doc_analyzer``.
+        doc_analyzer: Callable that turns one document into an iterable of tokens.
+
+    Returns:
+        A single ``defaultdict(int)`` with the token counts of the entire batch.
+    """
+    counts = defaultdict(int)
+    # Flatten the batch lazily into one stream of tokens.
+    tokens = (token for doc in sentences for token in doc_analyzer(doc))
+    for token in tokens:
+        counts[token] = counts[token] + 1
+
+    return counts
+
+def get_batches(s, n, truncate=False):
+    """Yield consecutive slices of ``s`` holding ``n`` items each.
+
+    Args:
+        s: Sequence supporting ``len()`` and slicing.
+        n: Batch size; must be positive.
+        truncate: When true, a trailing batch shorter than ``n`` is dropped
+            instead of being yielded.
+
+    Yields:
+        Slices of ``s`` of length ``n``, followed by the shorter remainder
+        (if any) unless ``truncate`` is set.
+
+    Raises:
+        ValueError: If ``n`` is not positive (raised on first iteration).
+    """
+    # Explicit check instead of ``assert``: asserts are stripped under -O,
+    # and n == 0 would otherwise loop forever.
+    if n <= 0:
+        raise ValueError(f'batch size must be positive, got {n}')
+
+    total = len(s)
+    full_end = total - total % n
+
+    # Slice by offset so each batch is copied once; re-slicing the remaining
+    # tail on every step would copy the rest of the sequence per batch.
+    for start in range(0, full_end, n):
+        yield s[start:start + n]
+
+    if full_end < total and not truncate:
+        yield s[full_end:]
+
+
+if __name__ == '__main__':
+    # Benchmark: split the corpus into large chunks, build one partial
+    # vocabulary per chunk in parallel, then merge the few resulting dicts.
+    n_jobs = 10
+    chunk_size = 100_000
+    factor_multiplier = 100 # This factor ensures 1 million documents in the dataset
+
+    sentences, _, _, _ = load_data()
+    sentences = sentences * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    # Only the analyzer callable of the vectorizer is used; it is applied to
+    # every document inside build_vocabulary.
+    count_vectorizer = feature_extraction.text.CountVectorizer()
+    doc_analyzer = count_vectorizer.build_analyzer()
+
+    with timer('overall', indentation=''):
+
+        # Map step: one task, and one returned dict, per chunk of documents.
+        with timer('build vocabularies', indentation=''):
+            p_build_vocabulary = partial(build_vocabulary,  doc_analyzer=doc_analyzer)
+            tasks = (delayed(p_build_vocabulary)(s) for s in get_batches(sentences, chunk_size))
+            partial_vocabularies = Parallel(n_jobs=n_jobs)(tasks)
+
+        # Reduce step: only one dict per chunk has to be merged here.
+        with timer('aggregate vocabularies', indentation=''):
+            vocabulary = aggregate_dicts(partial_vocabularies)
+
+    print('\nlen(partial_vocabularies)--->', len(partial_vocabularies))
+    print('len(vocabulary.items())--->', len(vocabulary.items()))
+    print("(vocabulary['from'], vocabulary['gift'])--->", (vocabulary['from'], vocabulary['gift']))
@@ -0,0 +1,34 @@
+import string
+import random
+
+from collections import defaultdict
+from itertools import repeat
+
+import sklearn
+from sklearn import feature_extraction, datasets
+
+from utils import timer, load_data
+
+def update_vocabulary(sentence, vocabulary, doc_analyzer):
+    """Add the token counts of one document to ``vocabulary`` in place.
+
+    Args:
+        sentence: Document passed as-is to ``doc_analyzer``.
+        vocabulary: Mutable mapping from token to count; the caller in this
+            script passes a ``defaultdict(int)``.
+        doc_analyzer: Callable that turns ``sentence`` into an iterable of tokens.
+    """
+    for token in doc_analyzer(sentence):
+        vocabulary[token] = vocabulary[token] + 1
+
+if __name__ == '__main__':
+    # Serial baseline: a single shared vocabulary is updated document by
+    # document in one process.
+    factor_multiplier = 100 # This factor ensures 1 million documents in the dataset
+
+    sentences, _, _, _ = load_data()
+    sentences = sentences * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    # Only the analyzer callable of the vectorizer is used; it is applied to
+    # every document inside update_vocabulary.
+    count_vectorizer = feature_extraction.text.CountVectorizer()
+    doc_analyzer = count_vectorizer.build_analyzer()
+
+    vocabulary = defaultdict(int)
+    with timer('overall', indentation=''):
+        for document in sentences:
+            update_vocabulary(document, vocabulary, doc_analyzer)
+
+    print('\nlen(vocabulary.items())--->', len(vocabulary.items()))
+    print("(vocabulary['from'], vocabulary['gift'])--->", (vocabulary['from'], vocabulary['gift']))
+
@@ -0,0 +1,73 @@
+import string
+import random
+
+from multiprocessing import Pool
+from collections import defaultdict
+from itertools import repeat
+
+import sklearn
+from sklearn import feature_extraction, datasets
+import time
+from functools import partial, reduce
+from collections import Counter
+
+
+def aggregate_dicts(dicts):
+    """Reduce a sequence of count dicts to one dict of per-key totals.
+
+    The totals are accumulated in place in ``dicts[0]``, and that same object
+    is returned; every other dict is only iterated.
+
+    Args:
+        dicts: Sequence of mappings from key to numeric count.
+
+    Returns:
+        ``dicts[0]`` holding the summed counts, or an empty
+        ``defaultdict(int)`` when no dicts were given.
+    """
+    if len(dicts) == 0:
+        return defaultdict(int)
+
+    # A one-element sequence returns its only dict (not the list), keeping
+    # the return type the same for every input length.
+    result = dicts[0]
+    for d in dicts[1:]:
+        for k, v in d.items():
+            # .get() tolerates a first dict without a default factory.
+            result[k] = result.get(k, 0) + v
+
+    return result
+
+def build_vocabulary(sentence, doc_analyzer):
+    """Build the token-count dict of one document.
+
+    Args:
+        sentence: Document passed as-is to ``doc_analyzer``.
+        doc_analyzer: Callable that turns ``sentence`` into an iterable of tokens.
+
+    Returns:
+        A ``defaultdict(int)`` mapping each token to how often it occurred.
+    """
+    counts = defaultdict(int)
+    for token in doc_analyzer(sentence):
+        counts[token] = counts[token] + 1
+    return counts
+
+def load_data():
+    """Load the train and test subsets of the 20 newsgroups dataset.
+
+    Each subset is fetched exactly once and both its documents and its labels
+    are taken from that single result.
+
+    Returns:
+        Tuple ``(X_train, y_train, X_test, y_test)`` built from the ``data``
+        and ``target`` attributes of the two fetched subsets.
+    """
+    train = sklearn.datasets.fetch_20newsgroups(subset="train")
+    test = sklearn.datasets.fetch_20newsgroups(subset="test")
+
+    return train.data, train.target, test.data, test.target
+
+
+if __name__ == '__main__':
+    # Benchmark: build one partial vocabulary per document with
+    # multiprocessing.Pool.map, then merge them serially.
+    n_jobs = 10
+    # NOTE(review): chunksize is defined but not passed to pool.map below,
+    # so the pool picks its own chunk size — confirm whether it should be used.
+    chunksize = 1000
+
+    factor_multiplier = 100 # This factor ensures 1 million documents in the dataset
+    sentences, _, _, _ = load_data()
+    sentences = sentences * factor_multiplier
+    print(f'num docs = {len(sentences)}')
+
+    count_vectorizer = feature_extraction.text.CountVectorizer()
+    doc_analyzer = count_vectorizer.build_analyzer()
+
+    t0 = time.time()
+    p_build_vocabulary = partial(build_vocabulary,  doc_analyzer=doc_analyzer)
+
+    # The context manager tears the worker processes down as soon as the
+    # blocking map has returned, so no workers are left behind.
+    with Pool(processes=n_jobs) as pool:
+        partial_vocabularies = pool.map(p_build_vocabulary, sentences)
+
+    print('len(partial_vocabularies)--->', len(partial_vocabularies))
+    vocabulary = aggregate_dicts(partial_vocabularies)
+
+    print('len(vocabulary.items())--->', len(vocabulary.items()))
+    print("(vocabulary['from'], vocabulary['gift'])--->", (vocabulary['from'], vocabulary['gift']))
+    print(f'time taken {time.time()-t0} seconds')
@@ -0,0 +1,61 @@
+import string
+import random
+
+from multiprocessing import Pool
+from multiprocessing.managers import BaseManager, DictProxy
+from collections import defaultdict
+from itertools import repeat
+
+import sklearn
+from sklearn import feature_extraction, datasets
+import time
+
+class MyManager(BaseManager):
+    """Manager subclass that only serves as a registry for shared types."""
+
+
+# Expose ``defaultdict`` through the manager: ``manager.defaultdict(...)``
+# creates the dict in the manager process and hands back a ``DictProxy``.
+MyManager.register(typeid='defaultdict', callable=defaultdict, proxytype=DictProxy)
+
+def update_vocabulary(sentence, manager_vocabulary, doc_analyzer):
+    """Add the token counts of one document to a shared vocabulary.
+
+    Args:
+        sentence: Document passed as-is to ``doc_analyzer``.
+        manager_vocabulary: Mutable mapping from token to count; the caller in
+            this script passes a manager proxy of a ``defaultdict(int)``.
+        doc_analyzer: Callable that turns ``sentence`` into an iterable of tokens.
+    """
+    for token in doc_analyzer(sentence):
+        # NOTE(review): this is a separate read followed by a write; when
+        # several pool workers update the same proxy, increments may be
+        # lost — confirm whether a lock is needed.
+        manager_vocabulary[token] = manager_vocabulary[token] + 1
+
+
+def load_data():
+    """Fetch the 20 newsgroups train and test subsets, once each.
+
+    Documents and labels of a subset come from the same fetched object, so
+    the dataset is not loaded repeatedly.
+
+    Returns:
+        Tuple ``(X_train, y_train, X_test, y_test)`` built from the ``data``
+        and ``target`` attributes of the two fetched subsets.
+    """
+    train = sklearn.datasets.fetch_20newsgroups(subset="train")
+    test = sklearn.datasets.fetch_20newsgroups(subset="test")
+
+    return train.data, train.target, test.data, test.target
+
+
+if __name__ == '__main__':
+    # Benchmark: every pool worker updates one vocabulary that lives in a
+    # manager process and is reached through a proxy.
+    n_jobs = 10
+    chunksize = 100
+
+    factor_multiplier = 100 # This factor ensures 1 million documents in the dataset
+    sentences, _, _, _ = load_data()
+    sentences = sentences * factor_multiplier
+    print(f'num docs = {len(sentences)}')
+
+    count_vectorizer = feature_extraction.text.CountVectorizer()
+    doc_analyzer = count_vectorizer.build_analyzer()
+
+    t0 = time.time()
+
+    # Both context managers guarantee cleanup: the pool workers are
+    # terminated and the manager server process is shut down on exit.
+    # Entering the manager starts its server process.
+    with Pool(processes=n_jobs) as pool, MyManager() as manager:
+        manager_vocabulary = manager.defaultdict(int)
+
+        pool.starmap(update_vocabulary, zip(sentences, repeat(manager_vocabulary), repeat(doc_analyzer)), chunksize=chunksize)
+        vocabulary = manager_vocabulary
+
+        # The proxy is only usable while the manager is running, so the
+        # results are read before leaving the ``with`` block.
+        print('len(vocabulary.items())--->', len(vocabulary.items()))
+        print("(vocabulary['from'], vocabulary['gift'])--->", (vocabulary['from'], vocabulary['gift']))
+        print(f'time taken {time.time()-t0} seconds')
+