davidbp
diff --git a/‎python_basics/multiprocessing/01_map_reduce_threads.py‎
Lines changed: 78 additions & 0 deletions b/‎python_basics/multiprocessing/01_map_reduce_threads.py‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/02_map_shared_dict_threads.py‎
Lines changed: 65 additions & 0 deletions b/‎python_basics/multiprocessing/02_map_shared_dict_threads.py‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/03_map_shared_dict_multiprocessing.py‎
Lines changed: 91 additions & 0 deletions b/‎python_basics/multiprocessing/03_map_shared_dict_multiprocessing.py‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/04_map_reduce_multiprocessing.py‎
Lines changed: 68 additions & 0 deletions b/‎python_basics/multiprocessing/04_map_reduce_multiprocessing.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎python_basics/multiprocessing/05_map_reduce_local_dict_threads.py‎
Lines changed: 71 additions & 0 deletions b/‎python_basics/multiprocessing/05_map_reduce_local_dict_threads.py‎
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,78 @@
+import string
+import time
+from collections import defaultdict
+from functools import partial
+from multiprocessing.dummy import Pool as ThreadPool
+
+import sklearn
+from sklearn import feature_extraction, datasets
+
+class timer():
+    """Context manager that prints how long its ``with`` body took.
+
+    On exit it prints ``time <name>  <seconds> sec`` prefixed by
+    ``indentation``, with the seconds rounded to 4 decimals.
+
+    Args:
+        name: Label included in the printed report.
+        indentation: String prepended to the report line.
+    """
+
+    def __init__(self, name='', indentation=''):
+        # ``start`` is still initialised here so the attribute exists even if
+        # the object is never entered.
+        self.start = time.time()
+        self.name = name
+        self.indentation = indentation
+
+    def __enter__(self):
+        # Restart the clock on entry so only the ``with`` body is measured, and
+        # return the instance so ``with timer(...) as t`` binds the timer
+        # instead of None.
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        # Nothing is returned, so an exception raised in the body propagates.
+        elapsed = round(time.time() - self.start, 4)
+        print(self.indentation + f'time {self.name}  {elapsed} sec')
+
+
+def load_data():
+    """Load the 20 newsgroups train and test splits.
+
+    Each split is fetched exactly once and both its documents and labels are
+    read from that single result. The previous version called
+    ``fetch_20newsgroups`` five times (once per returned field plus once for
+    a variable that was never used).
+
+    Returns:
+        Tuple ``(X_train, y_train, X_test, y_test)`` holding the ``.data`` and
+        ``.target`` attributes of the train and test subsets.
+    """
+    train = sklearn.datasets.fetch_20newsgroups(subset="train")
+    test = sklearn.datasets.fetch_20newsgroups(subset="test")
+    return train.data, train.target, test.data, test.target
+
+
+
+# Translation table that deletes every character in string.punctuation.
+# It is the same for every call, so it is built once at import time instead
+# of once per document.
+_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
+
+
+# Basic tokenizer: lowercases, strips punctuation, and splits on whitespace
+def simple_tokenizer(text):
+    """Return the tokens of ``text``.
+
+    Args:
+        text: String to tokenize.
+
+    Returns:
+        List of lowercase tokens with ASCII punctuation removed; empty when
+        ``text`` contains no non-punctuation, non-whitespace characters.
+    """
+    return text.lower().translate(_PUNCTUATION_TABLE).split()
+
+def aggregate_dicts(dicts):
+    """Sum a sequence of count mappings into one new mapping.
+
+    Args:
+        dicts: Iterable of mappings from key to numeric count.
+
+    Returns:
+        A new ``defaultdict(int)`` with the per-key totals. It is empty when
+        ``dicts`` is empty. None of the input mappings is modified.
+    """
+    # Accumulating into a fresh defaultdict fixes three problems at once:
+    # an empty input no longer raises IndexError, the first input is no longer
+    # mutated in place, and plain dicts no longer raise KeyError on new keys.
+    totals = defaultdict(int)
+    for partial in dicts:
+        for key, count in partial.items():
+            totals[key] += count
+    return totals
+
+def build_vocabulary(sentence):
+    """Count how often each token of ``sentence`` occurs.
+
+    Returns:
+        A ``defaultdict(int)`` mapping token -> number of occurrences.
+    """
+    counts = defaultdict(int)
+    for token in simple_tokenizer(sentence):
+        counts[token] += 1
+    return counts
+
+if __name__ == '__main__':
+    # Map/reduce word count with a thread pool: every document is counted on
+    # its own, then the per-document counts are summed.
+    n_jobs = 10
+    # Corpus is repeated 20x; the exact size is printed below
+    # (~226k documents if the train split has its usual 11,314 posts).
+    factor_multiplier = 20
+
+    train_docs, *_ = load_data()
+    sentences = train_docs * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    with timer('overall'):
+        # Map step: one word-count dict per document.
+        with timer('build vocabularies'), ThreadPool(n_jobs) as pool:
+            partial_vocabularies = pool.map(build_vocabulary, sentences)
+
+        # Reduce step: fold the per-document dicts into one vocabulary.
+        with timer('aggregate vocabularies'):
+            vocabulary = aggregate_dicts(partial_vocabularies)
+
+    print('\nlen(partial_vocabularies)--->', len(partial_vocabularies))
+    print('len(vocabulary.items())--->', len(vocabulary))
+    sample_counts = (vocabulary['from'], vocabulary['gift'])
+    print("(vocabulary['from'], vocabulary['gift'])--->", sample_counts)
+
@@ -0,0 +1,65 @@
+import string
+import time
+from collections import defaultdict
+from threading import Lock
+from multiprocessing.dummy import Pool as ThreadPool
+
+import sklearn
+from sklearn import datasets
+
+
+class timer:
+    """Print the wall-clock duration of a ``with`` block.
+
+    Args:
+        name: Label shown in the ``time <name>  <seconds> sec`` report.
+        indentation: String printed in front of the report.
+    """
+
+    def __init__(self, name='', indentation=''):
+        self.start = time.time()  # overwritten on entry; kept so the attribute always exists
+        self.name = name
+        self.indentation = indentation
+
+    def __enter__(self):
+        # Measure from entry rather than from construction, and hand the
+        # instance back so the ``as`` target is the timer, not None.
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        # Implicitly returns None: exceptions from the body are not swallowed.
+        seconds = round(time.time() - self.start, 4)
+        print(self.indentation + f'time {self.name}  {seconds} sec')
+
+
+def load_data():
+    """Return ``(X_train, y_train, X_test, y_test)`` for 20 newsgroups.
+
+    Every subset is fetched once and its ``.data`` / ``.target`` are taken
+    from the same result, rather than fetching each subset twice.
+    """
+    train_split = datasets.fetch_20newsgroups(subset="train")
+    test_split = datasets.fetch_20newsgroups(subset="test")
+    return train_split.data, train_split.target, test_split.data, test_split.target
+
+
+# Basic tokenizer: lowercase the text, drop ASCII punctuation, split on whitespace
+def simple_tokenizer(text):
+    """Return the list of normalized tokens found in ``text``."""
+    punctuation_remover = str.maketrans('', '', string.punctuation)
+    lowered = text.lower()
+    without_punctuation = lowered.translate(punctuation_remover)
+    return without_punctuation.split()
+
+
+# State shared by every worker thread: the global word counts and the lock
+# that serializes updates to them.
+shared_vocabulary = defaultdict(int)
+vocab_lock = Lock()
+
+
+def update_shared_vocabulary(sentence):
+    """Add the token counts of ``sentence`` to ``shared_vocabulary``.
+
+    Tokenization runs before the lock is taken; the lock is then held while
+    all tokens of the sentence are counted. Returns None.
+    """
+    tokens = simple_tokenizer(sentence)
+    vocab_lock.acquire()
+    try:
+        for token in tokens:
+            shared_vocabulary[token] += 1
+    finally:
+        vocab_lock.release()
+
+
+if __name__ == '__main__':
+    # Word count where every thread writes into one lock-protected dict.
+    n_jobs = 10
+    factor_multiplier = 20  # Ensures ~200k documents
+
+    train_docs, *_ = load_data()
+    sentences = train_docs * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    with timer('overall'):
+        with timer('build vocabularies (shared dict)'), ThreadPool(n_jobs) as pool:
+            # The mapped function only has side effects; its results are discarded.
+            pool.map(update_shared_vocabulary, sentences)
+
+    print('\nlen(vocabulary.items())--->', len(shared_vocabulary))
+    sample_counts = (shared_vocabulary['from'], shared_vocabulary['gift'])
+    print("(vocabulary['from'], vocabulary['gift'])--->", sample_counts)
+
@@ -0,0 +1,91 @@
+import string
+import time
+from collections import defaultdict
+from multiprocessing import Pool, Manager, Lock
+
+import sklearn
+from sklearn import datasets
+
+
+class timer:
+    """Report the wall-clock time elapsed since the timer was constructed.
+
+    The clock starts in ``__init__``; leaving the ``with`` block prints
+    ``time <name>  <seconds> sec`` prefixed by ``indentation``.
+    """
+
+    def __init__(self, name='', indentation=''):
+        self.start = time.time()
+        self.name, self.indentation = name, indentation
+
+    def __enter__(self):
+        # Lets callers write ``with timer(...) as t``.
+        return self
+
+    def __exit__(self, *args):
+        elapsed = round(time.time() - self.start, 4)
+        report = f'time {self.name}  {elapsed} sec'
+        print(self.indentation + report)
+
+
+def load_data():
+    """Load the 20 newsgroups documents and labels.
+
+    The train and test subsets are each fetched a single time (previously
+    each was fetched twice, once for ``.data`` and once for ``.target``).
+
+    Returns:
+        ``(X_train, y_train, X_test, y_test)``.
+    """
+    train, test = (datasets.fetch_20newsgroups(subset=name) for name in ("train", "test"))
+    return train.data, train.target, test.data, test.target
+
+
+# Tokenizer: lowercase, remove punctuation, split
+def simple_tokenizer(text):
+    """Split ``text`` into lowercase tokens after deleting ASCII punctuation."""
+    # The third argument of maketrans lists the characters to delete.
+    delete_punctuation = str.maketrans('', '', string.punctuation)
+    tokens = text.lower().translate(delete_punctuation).split()
+    return tokens
+
+
+# This function is called by each process
+# NOTE(review): despite the ``_old`` suffix this has the same logic as
+# ``update_shared_vocabulary``, and nothing in this file calls it.
+def update_shared_vocabulary_old(sentence, shared_dict, lock):
+    """Add the token counts of ``sentence`` to ``shared_dict`` under ``lock``.
+
+    Tokens are tallied in a private dict first; ``lock`` is then entered once
+    to fold the tallies into ``shared_dict``. Returns None.
+    """
+    tally = defaultdict(int)
+    for token in simple_tokenizer(sentence):
+        tally[token] += 1
+
+    with lock:
+        for token in tally:
+            shared_dict[token] = shared_dict.get(token, 0) + tally[token]
+
+
+def update_shared_vocabulary(sentence, shared_dict, lock):
+    """Merge the word counts of one sentence into ``shared_dict``.
+
+    Args:
+        sentence: Text to tokenize and count.
+        shared_dict: Mapping that receives the counts; must support ``get``
+            and item assignment.
+        lock: Context manager guarding ``shared_dict``.
+    """
+    # Count privately first -- this part needs no locking.
+    per_sentence = {}
+    for token in simple_tokenizer(sentence):
+        per_sentence[token] = per_sentence.get(token, 0) + 1
+
+    # Enter the lock a single time to publish all counts of this sentence.
+    with lock:
+        for token, occurrences in per_sentence.items():
+            previous = shared_dict.get(token, 0)
+            shared_dict[token] = previous + occurrences
+
+def init_process(shared_dict_, lock_):
+    """Store the shared dict and lock in module globals.
+
+    Used as the pool ``initializer`` so that ``wrapper`` can reach both
+    objects through the globals ``shared_dict`` and ``lock``.
+    """
+    global shared_dict, lock
+    shared_dict, lock = shared_dict_, lock_
+
+
+def wrapper(sentence):
+    """Single-argument adapter for ``pool.map``.
+
+    Forwards ``sentence`` together with the module-level ``shared_dict`` and
+    ``lock`` (set by ``init_process``) to ``update_shared_vocabulary``.
+    """
+    update_shared_vocabulary(sentence, shared_dict=shared_dict, lock=lock)
+
+
+if __name__ == '__main__':
+    # Word count where worker processes update one Manager-backed dict.
+    n_jobs = 8  # or os.cpu_count()
+    # Corpus repeated 20x; exact size is printed below
+    # (~226k documents if the train split has its usual 11,314 posts).
+    factor_multiplier = 20
+
+    train_docs, *_ = load_data()
+    sentences = train_docs * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    with timer('overall'), Manager() as manager:
+        # Proxies handed to every worker through the pool initializer.
+        shared_dict = manager.dict()
+        lock = manager.Lock()
+
+        with timer('build vocabularies (multiprocessing)'):
+            worker_state = (shared_dict, lock)
+            with Pool(processes=n_jobs, initializer=init_process, initargs=worker_state) as pool:
+                pool.map(wrapper, sentences)
+
+        # Copy out of the managed dict while the manager is still alive.
+        vocabulary = dict(shared_dict)
+
+    print('\nlen(vocabulary.items())--->', len(vocabulary))
+    sample_counts = (vocabulary.get('from', 0), vocabulary.get('gift', 0))
+    print("(vocabulary['from'], vocabulary['gift'])--->", sample_counts)
+
@@ -0,0 +1,68 @@
+import string
+import time
+from collections import defaultdict
+from multiprocessing import Pool, cpu_count
+from sklearn import datasets
+
+class timer:
+    """Context manager that prints the seconds elapsed since construction."""
+
+    def __init__(self, name='', indentation=''):
+        # The reference timestamp is taken here, not in __enter__.
+        self.start = time.time()
+        self.name = name
+        self.indentation = indentation
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc_info):
+        # Nothing is returned, so exceptions from the body are not suppressed.
+        seconds = round(time.time() - self.start, 4)
+        line = self.indentation + 'time {}  {} sec'.format(self.name, seconds)
+        print(line)
+
+
+def load_data():
+    """Fetch the 20 newsgroups train/test documents and labels.
+
+    Returns:
+        ``(X_train, y_train, X_test, y_test)`` taken from the ``.data`` and
+        ``.target`` attributes of each subset.
+    """
+    # One fetch per subset; the old code fetched each subset twice.
+    train_bunch = datasets.fetch_20newsgroups(subset="train")
+    test_bunch = datasets.fetch_20newsgroups(subset="test")
+    X_train, y_train = train_bunch.data, train_bunch.target
+    X_test, y_test = test_bunch.data, test_bunch.target
+    return X_train, y_train, X_test, y_test
+
+# Basic tokenizer: lowercases, strips punctuation, and splits on whitespace
+def simple_tokenizer(text):
+    """Return lowercase, punctuation-free tokens of ``text`` as a list."""
+    table = str.maketrans('', '', string.punctuation)
+    cleaned = text.lower().translate(table)
+    return cleaned.split()
+
+def aggregate_dicts(dicts):
+    """Combine per-document count mappings into a single total.
+
+    Args:
+        dicts: Iterable of mappings from key to numeric count (for example
+            the list returned by ``pool.map``).
+
+    Returns:
+        A new ``defaultdict(int)`` holding the summed counts; empty when no
+        mappings are given. The inputs are left unchanged.
+    """
+    # A fresh accumulator avoids the IndexError on an empty list, the in-place
+    # mutation of the first input, and the KeyError plain dicts used to hit.
+    combined = defaultdict(int)
+    for counts in dicts:
+        for word, occurrences in counts.items():
+            combined[word] += occurrences
+    return combined
+
+def build_vocabulary(sentence):
+    """Return a ``defaultdict(int)`` of token frequencies for ``sentence``."""
+    frequencies = defaultdict(int)
+    tokens = simple_tokenizer(sentence)
+    for position in range(len(tokens)):
+        frequencies[tokens[position]] += 1
+    return frequencies
+
+if __name__ == '__main__':
+    # Map/reduce word count with a process pool.
+    n_jobs = 10  # Set based on available cores or workload
+    # Corpus repeated 20x; exact size is printed below
+    # (~226k documents if the train split has its usual 11,314 posts).
+    factor_multiplier = 20
+
+    train_docs, *_ = load_data()
+    sentences = train_docs * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    with timer('overall'):
+        # Map: each worker process returns one count dict per document.
+        with timer('build vocabularies (multiprocessing)'), Pool(processes=n_jobs) as pool:
+            partial_vocabularies = pool.map(build_vocabulary, sentences)
+
+        # Reduce: sum the per-document dicts in the parent process.
+        with timer('aggregate vocabularies'):
+            vocabulary = aggregate_dicts(partial_vocabularies)
+
+    print('\nlen(partial_vocabularies)--->', len(partial_vocabularies))
+    print('len(vocabulary.items())--->', len(vocabulary))
+    sample_counts = (vocabulary['from'], vocabulary['gift'])
+    print("(vocabulary['from'], vocabulary['gift'])--->", sample_counts)
+
@@ -0,0 +1,71 @@
+import string
+import time
+from collections import defaultdict
+from threading import Lock
+from multiprocessing.dummy import Pool as ThreadPool
+
+import sklearn
+from sklearn import datasets
+from collections import Counter
+
+
+class timer:
+    """Time a ``with`` block and print the result on exit.
+
+    Args:
+        name: Label used in the printed ``time <name>  <seconds> sec`` line.
+        indentation: Prefix for the printed line.
+    """
+
+    def __init__(self, name='', indentation=''):
+        self.start = time.time()  # refreshed in __enter__
+        self.name = name
+        self.indentation = indentation
+
+    def __enter__(self):
+        # Start timing when the block is actually entered and return the
+        # timer itself; previously this returned None.
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        # No return value: exceptions raised inside the block propagate.
+        duration = round(time.time() - self.start, 4)
+        print(self.indentation + f'time {self.name}  {duration} sec')
+
+
+def load_data():
+    """Return the 20 newsgroups data as ``(X_train, y_train, X_test, y_test)``.
+
+    Documents and labels of a subset come from one ``fetch_20newsgroups``
+    call, so the dataset is loaded twice in total instead of four times.
+    """
+    train = datasets.fetch_20newsgroups(subset="train")
+    test = datasets.fetch_20newsgroups(subset="test")
+    return train.data, train.target, test.data, test.target
+
+
+# Basic tokenizer: lowercases, strips punctuation, and splits on whitespace
+def simple_tokenizer(text):
+    """Tokenize ``text`` into lowercase words without ASCII punctuation."""
+    # Empty from/to strings: the table only deletes, it maps nothing.
+    strip_table = str.maketrans('', '', string.punctuation)
+    normalized = text.lower().translate(strip_table)
+    words = normalized.split()
+    return words
+
+
+def build_local_vocabulary(sentences_chunk):
+    """Count tokens across every sentence in ``sentences_chunk``.
+
+    Returns:
+        A ``Counter`` mapping token -> occurrences within the chunk.
+    """
+    return Counter(
+        token
+        for text in sentences_chunk
+        for token in simple_tokenizer(text)
+    )
+
+if __name__ == '__main__':
+    # Word count where each thread counts a whole chunk of documents into its
+    # own Counter and the per-chunk Counters are merged afterwards.
+    from math import ceil
+
+    n_jobs = 10
+    factor_multiplier = 20  # corpus repeated 20x; exact size is printed below
+
+    sentences, _, _, _ = load_data()
+    sentences = sentences * factor_multiplier
+    print(f'num docs = {len(sentences)}\n')
+
+    # One contiguous chunk per worker. The size is clamped to at least 1 so an
+    # empty corpus yields zero chunks instead of range() raising on a zero step.
+    chunk_size = max(1, ceil(len(sentences) / n_jobs))
+    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]
+
+    with timer('overall'):
+        # The merge below is inside this timer, so it is part of the reported time.
+        with timer('build vocabularies (local merge)'):
+            with ThreadPool(n_jobs) as pool:
+                local_vocabularies = pool.map(build_local_vocabulary, chunks)
+
+            # Merge all local vocabularies into one
+            merged_vocab = Counter()
+            for vocab in local_vocabularies:
+                merged_vocab.update(vocab)
+
+    print('\nlen(vocabulary.items())--->', len(merged_vocab.items()))
+    print("(vocabulary['from'], vocabulary['gift'])--->", (merged_vocab['from'], merged_vocab['gift']))
+