llm.py
import configparser
import logging
import os
import warnings

import requests
from dotenv import load_dotenv
from openai import OpenAI

from utils import measure_execution_time


class LocalServerLlm(OpenAI):
"""
This class can be used to make the langchain API work with a local server hosted through
the llama-cpp-python project.
"""
def __init__(self):
print(">> LocalServerLlm init")
config = configparser.ConfigParser()
config.read('config.ini')
os.environ[
"OPENAI_API_KEY"
] = "lm-studio" # can be anything
os.environ["OPENAI_API_BASE"] = f"http://localhost:{config.get('INFERENCE', 'LOCAL_WEB_SERVER_PORT')}/v1"
os.environ["OPENAI_API_HOST"] = f"http://localhost:{config.get('INFERENCE', 'LOCAL_WEB_SERVER_PORT')}"
warnings.filterwarnings("ignore")
super().__init__()
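
# Example (hypothetical usage sketch, not part of the original file): with a
# llama-cpp-python or LM Studio server listening on the configured port, the
# class behaves like a regular OpenAI client:
#
#   llm = LocalServerLlm()
#   reply = llm.chat.completions.create(
#       model="local-model",  # most local servers ignore the model name
#       messages=[{"role": "user", "content": "Hello"}],
#   )
#   print(reply.choices[0].message.content)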

class HuggingFaceLlm:
    """Thin wrapper around the Hugging Face Inference API."""

    def __init__(self):
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')
        self.MODEL_MAX_OUTPUT_TOKENS = self.config.getint('INFERENCE', 'MODEL_MAX_OUTPUT_TOKENS')
        self.API_URL = self.config.get('INFERENCE', 'HUGGINGFACE_INFERENCE_URL')
        load_dotenv()
        # Token redacted in the source; in practice it would come from .env.
        self.headers = {"Authorization": "Bearer **************************************"}

    def query(self, payload):
        # POST the payload to the Inference API and decode the JSON body.
        response = requests.post(self.API_URL, headers=self.headers, json=payload).json()
        return response
    @measure_execution_time(">> LLM query")
    def __call__(self, message):
        result = self.query({
            "inputs": message,
            "parameters": {
                "max_new_tokens": self.MODEL_MAX_OUTPUT_TOKENS,
                "return_full_text": True,
                "temperature": 0.5,
                "max_time": 100,
            },
            "options": {
                "use_cache": False,
                "wait_for_model": True,
            },
        })
        if result:
            logging.info(f"LLM response: {result}")
            # A successful response is a list of generations; API errors come
            # back as a dict and would raise a KeyError here.
            return result[0]["generated_text"]
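
# Example (hypothetical usage sketch, not part of the original file): assuming
# config.ini points HUGGINGFACE_INFERENCE_URL at a hosted text-generation
# model, calling the wrapper returns the generated string directly:
#
#   hf_llm = HuggingFaceLlm()
#   print(hf_llm("Write a haiku about compilers."))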

class TogetherLlm:
    """Thin wrapper around the Together inference API."""

    def __init__(self):
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')
        self.MODEL_MAX_OUTPUT_TOKENS = self.config.getint('INFERENCE', 'MODEL_MAX_OUTPUT_TOKENS')
        self.url = self.config.get('INFERENCE', 'TOGETHER_INFERENCE_URL')
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            # Token redacted in the source; in practice it would come from .env.
            "Authorization": "Bearer ********************************"
        }

    def query(self, payload):
        # POST the payload to the Together endpoint and decode the JSON body.
        response = requests.post(self.url, headers=self.headers, json=payload).json()
        logging.info(response)
        return response
    @measure_execution_time(">> LLM query")
    def __call__(self, prompt):
        result = self.query(
            payload={
                "model": "codellama/CodeLlama-7b-Instruct-hf",
                # CodeLlama-Instruct expects the [INST] ... [/INST] wrapper.
                "prompt": "<s>[INST]" + prompt + "[/INST]",
                "max_tokens": 512,
                "stop": ["</s>", "[/INST]"],
                "temperature": 0.4,
                "top_p": 0.7,
                "top_k": 50,
                "repetition_penalty": 1,
                "n": 1,
            }
        )
        if result:
            # Unlike HuggingFaceLlm, this returns the raw JSON response;
            # callers extract the completion text themselves.
            return result
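
# Example (hypothetical usage sketch, not part of the original file): which
# backend to instantiate is a caller-side decision. This assumes config.ini
# provides MODEL_MAX_OUTPUT_TOKENS, the inference URLs, and the local port,
# and that a valid API token replaces the redacted one above.
if __name__ == "__main__":
    together_llm = TogetherLlm()
    raw = together_llm("Write a Python function that reverses a string.")
    # The exact JSON layout depends on the Together API version in use.
    print(raw)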