From cb58c31f520d4d620895c1c695b67648f723a142 Mon Sep 17 00:00:00 2001
From: Zhenzhong1 <zhenzhong.xu@intel.com>
Date: Wed, 20 Nov 2024 01:57:26 +0000
Subject: [PATCH 1/5] Handle multiple "data:" events in one streamed LLM chunk

---
 ChatQnA/chatqna.py | 60 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index 95318e9613..f409fb54bb 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -64,6 +64,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs = {}
         next_inputs["model"] = LLM_MODEL
         next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
+        print('LLM INPUT -----------------------------', next_inputs['messages'])
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
         next_inputs["stream"] = inputs["streaming"]
@@ -155,29 +156,56 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
 
     return next_data
 
-
 def align_generator(self, gen, **kwargs):
+    def split_lines(line):
+        """
+        Split line into individual `data:` segments if multiple `data:` sections exist.
+        """
+        parts = line.split("data:")
+        return [f"data:{part.strip()}\n\n" for part in parts if part.strip()]
     # openai reaponse format
     # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
     for line in gen:
         line = line.decode("utf-8")
-        start = line.find("{")
-        end = line.rfind("}") + 1
-
-        json_str = line[start:end]
-        try:
-            # sometimes yield empty chunk, do a fallback here
-            json_data = json.loads(json_str)
-            if (
-                json_data["choices"][0]["finish_reason"] != "eos_token"
-                and "content" in json_data["choices"][0]["delta"]
-            ):
-                yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
-        except Exception as e:
-            yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+        # One chunk can carry several "data:" events back to back; parse each one on its own.
+        if line.count("data:") > 1:
+            split_data = split_lines(line)
+
+            for part in split_data:
+                # Each part is a single "data:..." event rebuilt by split_lines().
+                start = part.find("{")
+                end = part.rfind("}") + 1
+
+                json_str = part[start:end]
+                try:
+                    # sometimes yield empty chunk, do a fallback here
+                    json_data = json.loads(json_str)
+                    if (
+                        json_data["choices"][0]["finish_reason"] != "eos_token"
+                        and "content" in json_data["choices"][0]["delta"]
+                    ):
+                        yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
+                except Exception as e:
+                    yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+        else:
+            print("line-------------", line)
+            start = line.find("{")
+            end = line.rfind("}") + 1
+
+            json_str = line[start:end]
+            try:
+                # sometimes yield empty chunk, do a fallback here
+                json_data = json.loads(json_str)
+                if (
+                    json_data["choices"][0]["finish_reason"] != "eos_token"
+                    and "content" in json_data["choices"][0]["delta"]
+                ):
+                    yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
+            except Exception as e:
+                yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+
     yield "data: [DONE]\n\n"
 
-
 class ChatQnAService:
     def __init__(self, host="0.0.0.0", port=8000):
         self.host = host

From 5a87bceda23ef0f0160a9a34549d40dd8e565722 Mon Sep 17 00:00:00 2001
From: Zhenzhong1 <zhenzhong.xu@intel.com>
Date: Tue, 19 Nov 2024 18:02:15 -0800
Subject: [PATCH 2/5] Add LLM_PROMPT env var to override the LLM user message

---
 ChatQnA/chatqna.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index f409fb54bb..dd5afaa026 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -48,6 +48,7 @@ def generate_rag_prompt(question, documents):
 LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 80))
 LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3")
+LLM_PROMPT = os.getenv("LLM_PROMPT", None)
 
 
 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -63,8 +64,11 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         # convert TGI/vLLM to unified OpenAI /v1/chat/completions format
         next_inputs = {}
         next_inputs["model"] = LLM_MODEL
-        next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
-        print('LLM INPUT -----------------------------', next_inputs['messages'])
+        if LLM_PROMPT is None:
+            next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
+        else:
+            next_inputs['messages'] = [{"role": "user", "content": LLM_PROMPT}]
+            
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
         next_inputs["stream"] = inputs["streaming"]

From 64ab7f124a317b21e8a32f7c2de2e2c038bc55bb Mon Sep 17 00:00:00 2001
From: Zhenzhong1 <zhenzhong.xu@intel.com>
Date: Wed, 27 Nov 2024 17:58:09 -0800
Subject: [PATCH 3/5] Add RAG env flag that truncates the prompt after the Question marker

---
 ChatQnA/chatqna.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index dd5afaa026..9c5d4e9761 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -49,6 +49,7 @@ def generate_rag_prompt(question, documents):
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 80))
 LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3")
 LLM_PROMPT = os.getenv("LLM_PROMPT", None)
+RAG = os.getenv("RAG", False)
 
 
 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -65,9 +66,24 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs = {}
         next_inputs["model"] = LLM_MODEL
         if LLM_PROMPT is None:
-            next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
+            if RAG == True:
+                print(f"LLM_PROMPT is None, RAG == True, INPUTS = {inputs['inputs']} \n\n")
+                question_index = inputs["inputs"].find("\n\n### Question:")
+                cleaned_query = inputs["inputs"][:question_index + len("\n\n### Question:")]
+                print(f"LLM_PROMPT is None, RAG == True, CLEANED_QUERY: {cleaned_query} \n\n")
+                next_inputs["messages"] = [{"role": "user", "content": cleaned_query}]
+            else:
+                next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
+            
         else:
-            next_inputs['messages'] = [{"role": "user", "content": LLM_PROMPT}]
+            if RAG == True:
+                print(f"LLM_PROMPT is not None, RAG == True, INPUTS = {inputs['inputs']} \n\n")
+                question_index = inputs["inputs"].find("\n\n### Question:")
+                cleaned_query = inputs["inputs"][:question_index + len("\n\n### Question:")]
+                print(f"LLM_PROMPT is not None, RAG == True, CLEANED_QUERY: {cleaned_query} \n\n")
+                next_inputs["messages"] = [{"role": "user", "content": cleaned_query}]
+            else:
+                next_inputs['messages'] = [{"role": "user", "content": LLM_PROMPT}]
             
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]

From c101a51ecb0bf506d4cbba975b03af6cd225637c Mon Sep 17 00:00:00 2001
From: Zhenzhong1 <zhenzhong.xu@intel.com>
Date: Wed, 27 Nov 2024 19:03:09 -0800
Subject: [PATCH 4/5] Enable RAG truncation when the RAG env var equals "enabled"

---
 ChatQnA/chatqna.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index 9c5d4e9761..bc5f69e927 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -49,7 +49,7 @@ def generate_rag_prompt(question, documents):
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 80))
 LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3")
 LLM_PROMPT = os.getenv("LLM_PROMPT", None)
-RAG = os.getenv("RAG", False)
+RAG = os.getenv("RAG", None)
 
 
 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -66,7 +66,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs = {}
         next_inputs["model"] = LLM_MODEL
         if LLM_PROMPT is None:
-            if RAG == True:
+            if RAG == "enabled":
                 print(f"LLM_PROMPT is None, RAG == True, INPUTS = {inputs['inputs']} \n\n")
                 question_index = inputs["inputs"].find("\n\n### Question:")
                 cleaned_query = inputs["inputs"][:question_index + len("\n\n### Question:")]
@@ -76,7 +76,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
                 next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
             
         else:
-            if RAG == True:
+            if RAG == "enabled":
                 print(f"LLM_PROMPT is not None, RAG == True, INPUTS = {inputs['inputs']} \n\n")
                 question_index = inputs["inputs"].find("\n\n### Question:")
                 cleaned_query = inputs["inputs"][:question_index + len("\n\n### Question:")]

From 205ba43e9fa19f3e6dbafefb31f28e5bc9f181d8 Mon Sep 17 00:00:00 2001
From: Zhenzhong1 <zhenzhong.xu@intel.com>
Date: Thu, 28 Nov 2024 02:28:11 -0800
Subject: [PATCH 5/5] Drop leading newlines from Question marker; remove debug print

---
 ChatQnA/chatqna.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index bc5f69e927..b90807a869 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -68,8 +68,8 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         if LLM_PROMPT is None:
             if RAG == "enabled":
                 print(f"LLM_PROMPT is None, RAG == True, INPUTS = {inputs['inputs']} \n\n")
-                question_index = inputs["inputs"].find("\n\n### Question:")
-                cleaned_query = inputs["inputs"][:question_index + len("\n\n### Question:")]
+                # Keep text up to and including the marker; unlike find() (-1), partition() keeps the whole prompt if it is absent.
+                cleaned_query = "".join(inputs["inputs"].partition("### Question:")[:2])
                 print(f"LLM_PROMPT is None, RAG == True, CLEANED_QUERY: {cleaned_query} \n\n")
                 next_inputs["messages"] = [{"role": "user", "content": cleaned_query}]
             else:
@@ -78,8 +78,8 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         else:
             if RAG == "enabled":
                 print(f"LLM_PROMPT is not None, RAG == True, INPUTS = {inputs['inputs']} \n\n")
-                question_index = inputs["inputs"].find("\n\n### Question:")
-                cleaned_query = inputs["inputs"][:question_index + len("\n\n### Question:")]
+                # Keep text up to and including the marker; unlike find() (-1), partition() keeps the whole prompt if it is absent.
+                cleaned_query = "".join(inputs["inputs"].partition("### Question:")[:2])
                 print(f"LLM_PROMPT is not None, RAG == True, CLEANED_QUERY: {cleaned_query} \n\n")
                 next_inputs["messages"] = [{"role": "user", "content": cleaned_query}]
             else:
@@ -208,7 +208,6 @@ def split_lines(line):
                 except Exception as e:
                     yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
         else:
-            print("line-------------", line)
             start = line.find("{")
             end = line.rfind("}") + 1