-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsplitTexts_loadDocs.py
More file actions
32 lines (24 loc) · 1.09 KB
/
splitTexts_loadDocs.py
File metadata and controls
32 lines (24 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from langchain.text_splitter import SpacyTextSplitter
from langchain.docstore.document import Document
class TextSplitterDocLoader:
    """Split raw texts into chunks and wrap each chunk in a langchain Document.

    Each text is first cut into fixed-size character windows
    (``pre_split_text``) and every window is then handed to langchain's
    ``SpacyTextSplitter``; the resulting pieces become ``Document`` objects.
    """

    def __init__(self, texts):
        """Store the texts to process.

        Args:
            texts: Iterable of strings. It is iterated once per call to
                ``split_texts_load_docs``.
        """
        self.texts = texts

    def pre_split_text(self, text: str, max_length: int = 900000):
        """Cut ``text`` into consecutive slices of at most ``max_length`` chars.

        Slicing is purely by character count, so a cut may land in the middle
        of a word or sentence. Concatenating the returned slices yields the
        original text.

        NOTE(review): the 900000 default presumably keeps each slice below
        spaCy's default ``nlp.max_length`` of 1,000,000 characters -- confirm.

        Args:
            text: The string to slice.
            max_length: Maximum number of characters per slice; must be >= 1.

        Returns:
            A list of string slices, in order. Empty when ``text`` is empty.

        Raises:
            ValueError: If ``max_length`` is less than 1.
        """
        # Without this guard a negative step makes range() empty, which
        # silently discards the whole text, and a zero step raises an
        # unrelated-looking "range() arg 3 must not be zero" error.
        if max_length < 1:
            raise ValueError(
                f"max_length must be a positive integer, got {max_length!r}"
            )
        return [
            text[start:start + max_length]
            for start in range(0, len(text), max_length)
        ]

    def split_texts_load_docs(self, *, chunk_size: int = 1000, chunk_overlap: int = 50):
        """Split every stored text and return the pieces as Documents.

        Args:
            chunk_size: Passed to ``SpacyTextSplitter`` as ``chunk_size``.
            chunk_overlap: Passed to ``SpacyTextSplitter`` as ``chunk_overlap``.

        Returns:
            A list of ``Document`` objects, one per piece produced by the
            splitter, in the order the texts and their pieces were processed.
        """
        splitter = SpacyTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        splitted_texts = []
        for text in self.texts:
            # Pre-split the text if it's too long, then run the splitter on
            # each window individually and collect all resulting pieces.
            for chunk in self.pre_split_text(text):
                splitted_texts.extend(splitter.split_text(chunk))

        # Create one Document per split piece.
        docs = [Document(page_content=piece) for piece in splitted_texts]
        print(f"Created {len(docs)} documents")
        return docs