-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathelasticbulkloader.py
More file actions
executable file
·49 lines (42 loc) · 1.25 KB
/
elasticbulkloader.py
File metadata and controls
executable file
·49 lines (42 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python3
import os
from elasticsearch import helpers, Elasticsearch
def parsecc100(path):
    """Yield documents from the CC-100 dump at *path*.

    The CC-100 format separates documents with a single blank line; each
    yielded document is the concatenation of its lines, trailing newlines
    included (the separating blank line itself is not part of the doc).

    Fixes vs. the original:
    - the file is closed deterministically via a context manager instead
      of being leaked;
    - the final document (one not followed by a blank line) is yielded;
      previously ``return doc`` buried it in StopIteration's value and it
      was silently dropped.
    """
    with open(path, "r") as dataset:
        doc = ""
        for line in dataset:
            if line == "\n":
                # Blank line terminates the current document.
                yield doc
                doc = ""
            else:
                doc = doc + line
        # EOF: emit a trailing document that had no blank-line terminator.
        if doc:
            yield doc
def chunkparsecc100(path, chunksize, index="jpnsearch"):
    """Yield lists of Elasticsearch bulk actions for the CC-100 file at *path*.

    Documents are read via :func:`parsecc100` and converted into one action
    per sentence (one sentence per line within a document). A batch of
    actions is yielded whenever at least *chunksize* characters of raw
    document text have accumulated since the previous batch.

    Fixes vs. the original:
    - batch size is tracked with ``len()`` on an integer counter instead of
      ``str.__sizeof__()``, which reports interpreter object overhead, and
      instead of quadratic ``chunk += doc`` string accumulation;
    - no longer shadows the builtin ``id``;
    - the trailing empty string produced by splitting a newline-terminated
      document no longer becomes an empty-sentence action;
    - an empty final batch is not yielded.
    """
    doc_id = 0
    pending = 0  # characters of raw text accumulated since the last batch
    actions = []
    for doc in parsecc100(path):
        doc_id += 1
        sentence_id = 0
        for sentence in doc.split("\n"):
            if not sentence:
                # Docs end with "\n", so split() leaves an empty tail.
                continue
            sentence_id += 1
            actions.append({
                "_index": index,
                "_source": {
                    "document_id": doc_id,
                    "sentence_id": sentence_id,
                    "sentence": sentence,
                },
            })
        pending += len(doc)
        if pending >= chunksize:
            yield actions
            actions = []
            pending = 0
    # Flush whatever remains after the last full-size batch.
    if actions:
        yield actions
# Connect to the local node; ELASTIC_PASS supplies the credentials
# portion of the connection URL.
client = Elasticsearch(f"http://{os.environ.get('ELASTIC_PASS')}@localhost:9250")

# Each emitted batch covers roughly this many mebibytes of raw text.
MiB = 64

# Stream the corpus into Elasticsearch one batch of bulk actions at a
# time, reporting cumulative progress after every batch.
chunks = chunkparsecc100("/datasets/cc-100/ja.txt", MiB * 1024**2)
for i, actions in enumerate(chunks, start=1):
    helpers.bulk(client, actions)
    print(f"{i*MiB}MiB loaded")