scrapp/simple.py at main · plehman2000/scrapp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import marimo

# Version string stored with the notebook (presumably the marimo release
# that generated this file — confirm).
__generated_with = "0.13.6"
# Application object; each `@app.cell` decorator below attaches a cell
# function to it, and `app.run()` at the bottom of the file executes it.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared dependencies: the standard-library JSON codec and the Ollama
    # client. Both are returned so that cells naming them as parameters can
    # use them.
    import json
    import ollama

    # NOTE: an earlier experiment used to live here as commented-out code.
    # It filled a {"title": ""} template from a text chunk with the
    # `nuextract` model and stripped the "<|end-output|>" marker from the
    # reply before printing it.

    return json, ollama


@app.cell
def _(json, ollama):
    # Ollama model tag used for every request issued from this cell.
    MODEL = 'gemma3:12b'


    def _chat(prompt):
        """Send `prompt` to MODEL as a single user message and return the reply text.

        Both extractors below share this helper so the request shape and the
        response unpacking live in one place. No sampling options (such as a
        temperature) are passed to the client.
        """
        response = ollama.chat(
            model=MODEL,
            messages=[{'role': 'user', 'content': prompt}],
        )
        return response['message']['content']


    def extract_entities(text):
        """Ask the model to list the entities mentioned in `text`.

        Returns the model's raw reply string; nothing is parsed or validated
        here.
        """
        input_llm = f"""
        Extract all entites from this chunk of text, return as a json:
        ### Text:
        {text}
        """
        return _chat(input_llm)


    def extract_facts(text):
        """Ask the model for per-entity atomic facts found in `text`.

        The prompt embeds a pretty-printed example of the expected JSON shape
        (entity name mapped to a list of fact sentences). Returns the model's
        raw reply string; the caller is responsible for parsing it.
        """
        # Example of the requested output shape; round-tripped through the
        # json module so it appears indented in the prompt.
        schema = """ {"entity1": ["fact1", "fact2"]}"""

        # An earlier draft also carried a bare string mentioning an explicit
        # entity list. It was never part of the prompt (a no-op expression
        # statement), so it has been removed.
        input_llm = f"""
        Collect all the atomic propositions about the entities in this list and put them in a json of the following form, returning only this json. Word each fact such that is uses the entity as the first word in the sentence:
        {json.dumps(json.loads(schema), indent=4)}

        Here's the text:

        {text}
        """
        return _chat(input_llm)


    return (extract_facts,)


@app.cell
def _():
    from langchain_text_splitters import RecursiveCharacterTextSplitter


    # Target chunk length, measured with `len` (see length_function below).
    # It is defined once and used for both the size and the overlap so the
    # two can never drift apart when the value is tuned.
    chunk_size = 1000
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        # Overlap is 10% of the chunk size.
        chunk_overlap=int(chunk_size * 0.1),
        length_function=len,
        is_separator_regex=False,
    )

    return (splitter,)


@app.cell
def _():
    # Read the whole input document into memory as UTF-8 text.
    # The context manager closes the file handle as soon as the read is done
    # instead of leaving it to garbage collection. The forward-slash path
    # resolves on both Windows and POSIX; a backslash is a separator only on
    # Windows. `_source` is underscore-prefixed so that it stays private to
    # this cell.
    with open("docs/rando.txt", encoding="utf8") as _source:
        all_text = _source.read()
    return (all_text,)


@app.cell
def _(all_text, splitter):
    # Split the full document into chunks using the configured splitter.
    chunks = splitter.split_text(all_text)
    # Preview the second chunk as a quick sanity check, but only when it
    # exists: a short document can yield fewer than two chunks, and an
    # unguarded index would raise IndexError before `chunks` is returned.
    if len(chunks) > 1:
        print(chunks[1])
    return (chunks,)


@app.cell
def _(chunks, extract_facts):
    # Run fact extraction on the first chunk only and hand the raw model
    # reply to later cells.
    # Entity extraction (calling extract_entities and parsing its reply as
    # JSON) is currently disabled in this cell, so `facts` is its only
    # product.
    _first_chunk = chunks[0]
    facts = extract_facts(_first_chunk)

    return (facts,)


@app.cell
def _(facts, json):
    # Parse the model reply held in `facts` into Python data.
    #
    # The reply is expected to be JSON wrapped in a Markdown code fence
    # ("```json" ... "```"). The previous fixed slice [7:-3] only worked when
    # the reply was exactly that, with no surrounding whitespace, and a
    # trailing newline or a missing fence made json.loads fail. Here the
    # fence is removed only when it is actually present; replies that the
    # old slice handled parse to the same value.
    #
    # Raises json.JSONDecodeError if the remaining text is not valid JSON.
    _payload = facts.strip()
    if _payload.startswith("```"):
        _payload = _payload[3:]
        # Optional language tag directly after the opening fence.
        if _payload[:4].lower() == "json":
            _payload = _payload[4:]
        if _payload.endswith("```"):
            _payload = _payload[:-3]
    facts_json = json.loads(_payload)

    return (facts_json,)


@app.cell
def _(facts_json):
    # Bare expression that evaluates the parsed facts (presumably so the
    # notebook renders them as this cell's output — confirm). Nothing is
    # returned to other cells.
    facts_json
    return


@app.cell
def _():
    # Free-form design notes kept as a string literal; this cell runs no
    # logic and returns nothing. The notes sketch the planned pipeline:
    # extract entities, resolve duplicates (string matching, embedding
    # similarity), then substitute resolved entities across facts.
    """
    1. extract all entities, use coreference resolution
    2. Resolve entities
        1. check for plurality? string matching
        2. cosine similarity of embedding


    [17`68yxdj17g] = ["[viral biology]" , "[the study of viruses]"]


    1. "[viral biology] is a branch of [biology]"
    2.                             "[Biology] is the science that studies life"

    3. [Virology] is a branch of "the science that studies life"

    """
    return


@app.cell
def _(entities_json):
    # Load the extracted entities into a pytholog knowledge base as
    # "type(<entity>, <type>)" facts and echo the generated facts.
    # NOTE(review): no cell visible in this file produces `entities_json`
    # (the entity-extraction call is commented out), so confirm where it is
    # meant to come from.
    import pytholog as pl

    bio_kb = pl.KnowledgeBase("biology")

    # One lower-cased fact string per entry of entities_json['entities'];
    # each entry is read via its 'entity' and 'type' keys.
    type_clauses = []
    for _record in entities_json['entities']:
        _clause = f"type({_record['entity']}, {_record['type']})"
        type_clauses.append(_clause.lower())

    bio_kb(type_clauses)
    print(type_clauses)
    return


# Run the app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()