from itertools import islice
from re import compile, MULTILINE
from typing import Iterator, Optional, Tuple


class LegalProvisionsLoader:
    """Split a legal-code text file into one document per article.

    The file must start with a three-line header: the name of the code on the
    first line and, on the third line, an optional ``Partie ...`` title. Every
    following line is checked for a ``Livre``, ``Titre``, ``Chapitre`` or
    ``Article`` heading at the very start of the line; anything else is body
    text of the article currently being read.
    """

    # Number of leading lines reserved for the header (code name / partie).
    HEADER_LINES = 3

    # Structural headings, ordered from outermost to innermost.
    HIERARCHY = ("livre", "titre", "chapitre")

    RE_LEVELS = {
        "partie": compile(r'^\s*(Partie [^\n]+)', flags=MULTILINE),
        "livre": compile(r'^(Livre [^\n]+)', flags=MULTILINE),
        "titre": compile(r'^(Titre [^\n]+)', flags=MULTILINE),
        "chapitre": compile(r'^(Chapitre [^\n]+)', flags=MULTILINE),
        "article": compile(r'^(Article [^\n]+)', flags=MULTILINE)
    }

    def __init__(self, file_path: str) -> None:
        """Store *file_path* and read the code name and partie from its header.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the file has fewer than ``HEADER_LINES`` lines.
        """
        self.file_path = file_path

        self.code, self.partie = self._extract_code_partie()

    def _extract_code_partie(self) -> Tuple[str, Optional[str]]:
        """Return ``(code, partie)`` read from the header lines.

        ``code`` is the stripped first line. ``partie`` is the ``Partie ...``
        title found on the third line, or ``None`` when that line does not
        hold one.
        """
        with open(self.file_path, 'r', encoding='utf-8') as file:
            # islice stops quietly on a short file, unlike repeated next().
            lines = list(islice(file, self.HEADER_LINES))

        if len(lines) < self.HEADER_LINES:
            raise ValueError(
                f"{self.file_path}: expected at least {self.HEADER_LINES} "
                f"header lines, found {len(lines)}"
            )

        match = self.RE_LEVELS["partie"].match(lines[2])
        partie = match.group(1).strip() if match else None

        return lines[0].strip(), partie

    def _match_heading(self, line: str) -> Optional[Tuple[str, str]]:
        """Return ``(level, title)`` when *line* is a heading, else ``None``."""
        for level in (*self.HIERARCHY, "article"):
            match = self.RE_LEVELS[level].match(line)
            if match:
                return level, match.group(1).strip()
        return None

    def _record(self, buffer: list, current: dict) -> Tuple[str, dict]:
        """Build the ``(text, metadata)`` pair for the buffered article."""
        # A fresh dict is built on purpose: `current` keeps changing while
        # the rest of the file is parsed.
        metadata = {
            "code": self.code,
            "partie": self.partie,
            **current,
            "source": self.file_path
        }
        return ''.join(buffer).strip(), metadata

    def _iter_articles(self) -> Iterator[Tuple[str, dict]]:
        """Yield ``(text, metadata)`` for every article, in file order.

        An article is closed by the next heading of any level, so it is always
        emitted with the livre/titre/chapitre it was actually found under.
        """
        current = {level: None for level in (*self.HIERARCHY, "article")}
        buffer = []

        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in islice(file, self.HEADER_LINES, None):
                heading = self._match_heading(line)
                if heading is None:
                    buffer.append(line)
                    continue

                # Emit the pending article BEFORE touching `current`,
                # otherwise it would carry the metadata of the next section.
                if current["article"] and buffer:
                    yield self._record(buffer, current)

                level, title = heading
                if level == "article":
                    current["article"] = title
                    buffer = [line]
                    continue

                current[level] = title
                # Entering a new section invalidates everything nested in it.
                depth = self.HIERARCHY.index(level)
                for deeper in self.HIERARCHY[depth + 1:]:
                    current[deeper] = None
                # Text between this heading and the next article belongs to
                # no article.
                current["article"] = None
                buffer = []

        if current["article"] and buffer:
            yield self._record(buffer, current)

    def lazy_load(self) -> Iterator["Document"]:
        """Yield one langchain ``Document`` per article of the file.

        ``page_content`` is the article text including its heading line.
        ``metadata`` holds ``code``, ``partie``, ``livre``, ``titre``,
        ``chapitre``, ``article`` and ``source``.
        """
        # Imported here so the parsing helpers stay usable without langchain.
        from langchain_core.documents import Document

        for page_content, metadata in self._iter_articles():
            yield Document(page_content=page_content, metadata=metadata)
7877e29..6051793 100644 Binary files a/modes/__pycache__/book_mode.cpython-313.pyc and b/modes/__pycache__/book_mode.cpython-313.pyc differ diff --git a/modes/__pycache__/chat_mode.cpython-313.pyc b/modes/__pycache__/chat_mode.cpython-313.pyc index bce648c..7559eb7 100644 Binary files a/modes/__pycache__/chat_mode.cpython-313.pyc and b/modes/__pycache__/chat_mode.cpython-313.pyc differ diff --git a/modes/__pycache__/haiku_mode.cpython-313.pyc b/modes/__pycache__/haiku_mode.cpython-313.pyc index bd68c27..20a4f0d 100644 Binary files a/modes/__pycache__/haiku_mode.cpython-313.pyc and b/modes/__pycache__/haiku_mode.cpython-313.pyc differ diff --git a/modes/__pycache__/legal_provisions_mode.cpython-313.pyc b/modes/__pycache__/legal_provisions_mode.cpython-313.pyc new file mode 100644 index 0000000..c792616 Binary files /dev/null and b/modes/__pycache__/legal_provisions_mode.cpython-313.pyc differ diff --git a/modes/__pycache__/load_book_mode.cpython-313.pyc b/modes/__pycache__/load_book_mode.cpython-313.pyc index 767f53b..419d8a5 100644 Binary files a/modes/__pycache__/load_book_mode.cpython-313.pyc and b/modes/__pycache__/load_book_mode.cpython-313.pyc differ diff --git a/modes/__pycache__/load_haiku_mode.cpython-313.pyc b/modes/__pycache__/load_haiku_mode.cpython-313.pyc index 5207854..85c2721 100644 Binary files a/modes/__pycache__/load_haiku_mode.cpython-313.pyc and b/modes/__pycache__/load_haiku_mode.cpython-313.pyc differ diff --git a/modes/__pycache__/load_legal_provisions_mode.cpython-313.pyc b/modes/__pycache__/load_legal_provisions_mode.cpython-313.pyc new file mode 100644 index 0000000..1366822 Binary files /dev/null and b/modes/__pycache__/load_legal_provisions_mode.cpython-313.pyc differ diff --git a/modes/legal_provisions_mode.py b/modes/legal_provisions_mode.py new file mode 100644 index 0000000..73471c3 --- /dev/null +++ b/modes/legal_provisions_mode.py @@ -0,0 +1,147 @@ +from os import getenv + +from argparse import _SubParsersAction + +from 
from os import getenv

from argparse import _SubParsersAction

from langchain.chat_models import init_chat_model
from langchain.output_parsers import BooleanOutputParser

from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma

from mode import Mode
from console import Console


class LegalProvisionsMode(Mode):
    """Chat loop answering questions from the history and indexed legal articles.

    For every user message the model is first asked whether the conversation
    history alone is enough to answer. Only when it is not are the closest
    articles fetched from the ``legal-provisions`` Chroma collection and
    injected into the system prompt.
    """

    # Store location used when VECTOR_STORE_DATA is not set.
    DEFAULT_STORE_DIRECTORY = "./.store"

    # Number of articles retrieved for one question.
    RETRIEVED_DOCUMENTS = 5

    # Text injected in place of the excerpts when none are retrieved, so the
    # prompt never contains the literal "None".
    NO_DOCUMENTS_NOTICE = (
        "(Aucun extrait fourni : réponds à partir de l'historique de conversation.)"
    )

    history: list[BaseMessage]

    def __init__(
        self,
        console: Console,
        model: str = "gpt-4o-mini",
        system: str = "default",
        verbose: bool = False
    ):
        """Store the chat settings and start with an empty conversation."""
        super().__init__(console)

        self.model = model
        self.system = system
        self.verbose = verbose
        # Per-instance list: a class-level default would be shared by every
        # instance of the mode.
        self.history = []

    @staticmethod
    def add_subparser(name: str, subparser: _SubParsersAction):
        """Register the sub-command *name* with its --model/--system/--verbose options."""
        chat_subparser = subparser.add_parser(name)
        chat_subparser.add_argument("--model", type=str, default="gpt-4o-mini")
        chat_subparser.add_argument("--system", type=str, default="default")
        chat_subparser.add_argument("--verbose", "-v", action="store_true")

    def __should_retrieve_from_history_only(
        self,
        model
    ):
        """Ask *model* whether the history alone can answer the last question.

        Returns:
            True when the model replies 'oui', False when it replies 'non' or
            when its reply cannot be interpreted.
        """
        message_prompt = """
            Compte tenu de l'historique de conversation, réponds par 'oui' ou par 'non' s'il est possible de répondre à la question de l'utilisateur en fonction des informations déjà disponible dans l'historique.\n
            Réponds excclusivement soit 'oui', soit 'non' et rien de plus.
        """

        prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(message_prompt),
            MessagesPlaceholder(variable_name="history"),
        ])

        chain = prompt | model | BooleanOutputParser(false_val='non', true_val='oui')
        try:
            return chain.invoke({'history': self.history})
        except ValueError:
            # The parser rejects any reply that is not oui/non. Retrieving
            # articles is the safe default and keeps the chat loop alive.
            return False

    @staticmethod
    def _format_documents(documents) -> str:
        """Join the text of the retrieved documents into one prompt-ready string."""
        return "\n\n".join(document.page_content for document in documents)

    def _stream_answer(self, chain, documents: str) -> str:
        """Stream the model's answer to the console and return the full text."""
        chunks = []
        self.console.bot_start()
        for chunk in chain.stream({
            "messages": self.history,
            "documents": documents
        }):
            chunks.append(chunk)
            self.console.bot_chunk(chunk=chunk)
        self.console.bot_end()

        return "".join(chunks)

    def run(self):
        """Run the interactive question/answer loop until the process is stopped."""
        # Load model
        if self.verbose:
            self.console.info(f"Loading model {self.model}...")

        model = init_chat_model(
            self.model,
            model_provider="openai",
            api_key=getenv('OPENAI_API_KEY')
        )

        # Load vector store
        if self.verbose:
            self.console.info(f"Loading embedding {getenv('EMBEDDING_MODEL')}")

        embeddings = OpenAIEmbeddings(
            model=getenv('EMBEDDING_MODEL'),
            api_key=getenv('OPENAI_API_KEY')
        )

        vector_store = Chroma(
            collection_name='legal-provisions',
            embedding_function=embeddings,
            # Same variable the loading mode writes to, so both modes open
            # the same store.
            persist_directory=getenv('VECTOR_STORE_DATA', self.DEFAULT_STORE_DIRECTORY)
        )

        # System prompt
        system_prompt = """
            Réponds à la question de l’utilisateur en t’appuyant sur le contenue de l'historique de conversation et en t'appuyant sur les extraits du code de l'action sociale et des familles fourni ci-dessous.

            Voici les extraits pertinents des articles de loi relatifs à la question de l’utilisateur :

            {documents}
        """

        if self.verbose:
            self.console.system_output(system_prompt)

        # Create prompt
        prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_prompt),
            MessagesPlaceholder(variable_name="messages"),
        ])

        # Create chain
        chain = prompt | model | StrOutputParser()

        while True:
            user_input = self.console.human_input()
            self.history.append(HumanMessage(user_input))

            history_is_enough = self.__should_retrieve_from_history_only(model=model)
            if self.verbose:
                self.console.info(f"Should retrive from history only : {history_is_enough}")

            if history_is_enough:
                excerpts = self.NO_DOCUMENTS_NOTICE
            else:
                documents = vector_store.similarity_search(
                    query=user_input,
                    k=self.RETRIEVED_DOCUMENTS
                )

                if self.verbose:
                    for document in documents:
                        self.console.info(f'{document} \n')

                # Pass plain text: formatting the list itself would put the
                # Python repr of the Document objects in the prompt.
                excerpts = self._format_documents(documents)

            bot_message = self._stream_answer(chain, excerpts)
            self.history.append(AIMessage(content=bot_message))
from itertools import islice
from os import path, getenv

from argparse import _SubParsersAction

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma

from mode import Mode
from console import Console
from legal_provisions_loader import LegalProvisionsLoader


class LoadLegalProvisionsMode(Mode):
    """Index the articles of a legal-code text file into the vector store.

    The file is parsed with ``LegalProvisionsLoader`` and every resulting
    document is added to the ``legal-provisions`` Chroma collection.
    """

    # Number of documents sent to the vector store per call.
    BATCH_SIZE = 64

    # Store location used when VECTOR_STORE_DATA is not set.
    DEFAULT_STORE_DIRECTORY = "./.store"

    def __init__(
            self,
            console: Console,
            book: str,
            verbose: bool = False):
        """Store the path of the file to load and the verbosity flag."""
        super().__init__(console)

        self.book = book
        self.verbose = verbose

    @staticmethod
    def add_subparser(name: str, subparser: _SubParsersAction):
        """Register the sub-command *name* with its ``book`` argument and --verbose flag."""
        load_book_subparser = subparser.add_parser(name)
        load_book_subparser.add_argument("book", type=str, help="The book to load")
        load_book_subparser.add_argument("--verbose", "-v", action="store_true", help="Verbose mode")

    def run(self):
        """Parse ``self.book`` and add its articles to the vector store in batches."""
        self.console.info(f"Loading book {self.book}...")

        loader = LegalProvisionsLoader(self.book)

        embeddings = OpenAIEmbeddings(
            model=getenv('EMBEDDING_MODEL'),
            api_key=getenv('OPENAI_API_KEY')
        )

        vector_store = Chroma(
            collection_name='legal-provisions',
            embedding_function=embeddings,
            # Without a directory the collection lives in memory only and the
            # whole load would be lost on exit, hence the fallback.
            persist_directory=getenv('VECTOR_STORE_DATA', self.DEFAULT_STORE_DIRECTORY)
        )

        documents = iter(loader.lazy_load())
        indexed = 0
        # Batching avoids one embedding round-trip per article while still
        # streaming the file instead of holding every document in memory.
        while batch := list(islice(documents, self.BATCH_SIZE)):
            vector_store.add_documents(batch)
            indexed += len(batch)

            if self.verbose:
                self.console.info(f"Indexed {indexed} articles...")
b376f2c..58187ab 100644 Binary files a/store/chroma.sqlite3 and b/store/chroma.sqlite3 differ