This commit is contained in:
xyj 2024-01-12 10:26:57 +08:00
commit 96ef0fc852
172 changed files with 2470833 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.idea

148
ai.py Normal file
View File

@ -0,0 +1,148 @@
import requests
import uvicorn
from fastapi import FastAPI, WebSocket
import base64
import datetime
import hashlib
import hmac
import json
from urllib.parse import urlparse
from datetime import datetime
from time import mktime
from urllib.parse import urlencode
from wsgiref.handlers import format_date_time
import websockets
# FastAPI application object; the WebSocket proxy route below is registered on it.
app = FastAPI()
# Maps the version segment of the request path to the upstream Spark WebSocket endpoint.
Spark_url = {
    "v1.1": "wss://spark-api.xf-yun.com/v1.1/chat",
    "v2.1": "wss://spark-api.xf-yun.com/v2.1/chat",
    "v3.1": "wss://spark-api.xf-yun.com/v3.1/chat"
}
# Module-level message list that getText() appends to; shared by every caller in this process.
text = []
class Ws_Param(object):
    """Credentials plus endpoint for one Spark connection; builds the signed URL."""

    def __init__(self, APPID, APIKey, APISecret, Spark_url):
        """Store the credentials and split the endpoint URL into host and path."""
        endpoint = urlparse(Spark_url)
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.host = endpoint.netloc
        self.path = endpoint.path
        self.Spark_url = Spark_url

    def create_url(self):
        """Return the endpoint URL with authorization, date and host query parameters.

        The signature is an HMAC-SHA256 (keyed with APISecret) over the
        "host", "date" and request-line text, base64-encoded and embedded in
        a base64-encoded authorization string.
        """
        # RFC 1123 formatted timestamp for the current time.
        date = format_date_time(mktime(datetime.now().timetuple()))

        # Text that gets signed: host line, date line, request line.
        signature_origin = "\n".join((
            "host: " + self.host,
            "date: " + date,
            "GET " + self.path + " HTTP/1.1",
        ))
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature_sha_base64 = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = f'api_key="{self.APIKey}", algorithm="hmac-sha256", headers="host date request-line", signature="{signature_sha_base64}"'
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')
        ).decode(encoding='utf-8')

        # Authentication parameters are appended to the endpoint as a query string.
        auth_params = {
            "authorization": authorization,
            "date": date,
            "host": self.host,
        }
        return self.Spark_url + '?' + urlencode(auth_params)
def getText(role, content):
    """Append a ``{"role", "content"}`` message to the module-level ``text`` list.

    Returns the same module-level list, which now includes the new message.
    """
    text.append({"role": role, "content": content})
    return text
def getlength(text):
    """Return the combined length of the ``"content"`` values of all messages in ``text``."""
    return sum(len(message["content"]) for message in text)
def checklen(text, max_length=8192):
    """Drop the oldest messages until the total content length is below ``max_length``.

    Args:
        text: list of message dicts, each with a ``"content"`` string.
            The list is trimmed in place.
        max_length: messages are removed from the front while the summed
            content length is greater than or equal to this value.

    Returns:
        The same list object that was passed in, after trimming.
    """
    total = sum(len(message["content"]) for message in text)
    # Count how many leading messages must go, keeping a running total
    # instead of re-measuring the whole list after every removal.
    drop = 0
    while drop < len(text) and total >= max_length:
        total -= len(text[drop]["content"])
        drop += 1
    # Single slice deletion keeps the caller's list object intact.
    del text[:drop]
    return text
def v1wsUrl(version, appid, api_secret, api_key):
    """Return a signed Spark WebSocket URL for ``version``.

    ``version`` is looked up in the module-level ``Spark_url`` mapping, so an
    unknown version raises ``KeyError``.
    """
    endpoint = Spark_url[version]
    return Ws_Param(appid, api_key, api_secret, endpoint).create_url()
def get_query(data, knowledge_base_name, score_threshold, top_k,
              search_url="http://127.0.0.1:7861/knowledge_base/search_docs",
              timeout=30):
    """Turn the latest user message into a knowledge-base-augmented prompt.

    The content of the last entry in ``data["payload"]["message"]["text"]`` is
    sent to the knowledge-base search service. If documents come back, their
    ``page_content`` values are joined and wrapped in a prompt around the
    original question; otherwise the question is returned unchanged.

    Args:
        data: request payload containing ``payload.message.text`` (a list of
            message dicts with a ``"content"`` key).
        knowledge_base_name: name of the knowledge base to search.
        score_threshold: relevance threshold forwarded to the search service.
        top_k: number of documents requested from the search service.
        search_url: URL of the search endpoint.
        timeout: seconds to wait for the search service before
            ``requests`` raises, so a stalled service cannot hang the caller.

    Returns:
        The prompt string, or the original query when nothing was found.
    """
    query = data["payload"]["message"]["text"][-1]["content"]  # last message in the list
    search_request = {
        "knowledge_base_name": knowledge_base_name,
        "query": query,
        "score_threshold": score_threshold,
        "top_k": top_k
    }
    docs = requests.post(search_url, json=search_request, timeout=timeout).json()
    if not docs:
        return query
    wiki_content = "\n".join(doc['page_content'] for doc in docs)
    prompt = "请将以下内容作为已知信息:\n" + wiki_content + (
        "\n请根据以上内容回答用户的问题。\n问题:\n") + query + "\n回答: "
    return prompt
@app.websocket("/{version}/chat")
async def wsk(ws: WebSocket, version: str):
    """Proxy one chat request from a client WebSocket to the upstream Spark API.

    Flow: accept the client socket, read credentials and search settings from
    the query string, receive one JSON request, replace the last message with
    the knowledge-base-augmented prompt, trim the history, then stream every
    upstream frame back to the client until the final frame arrives.

    Query parameters read: ``appid``, ``api_secret``, ``api_key``,
    ``knowledge_base_name`` (default "threshold"), ``score_threshold``
    (default 0.5) and ``top_k`` (default 2).
    """
    import asyncio  # local import: only needed to off-load the blocking search call

    try:
        await ws.accept()
        params = ws.query_params
        appid = params.get("appid")
        api_secret = params.get("api_secret")
        api_key = params.get("api_key")
        knowledge_base_name = params.get("knowledge_base_name") or "threshold"
        # Query-string values arrive as strings; convert so the search
        # request carries numbers whether or not the defaults are used.
        score_threshold = float(params.get("score_threshold") or 0.5)
        top_k = int(params.get("top_k") or 2)
        wsUrl = v1wsUrl(version, appid, api_secret, api_key)
        r = await ws.receive_json()
        messages = r["payload"]["message"]["text"]
        # get_query performs a blocking HTTP request; run it in the default
        # executor so other connections keep being served meanwhile.
        loop = asyncio.get_running_loop()
        messages[-1]["content"] = await loop.run_in_executor(
            None, get_query, r, knowledge_base_name, score_threshold, top_k
        )
        r["payload"]["message"]["text"] = checklen(messages)
        try:
            async with websockets.connect(wsUrl) as websocket:
                await websocket.send(json.dumps(r))
                async for message in websocket:
                    data = json.loads(message)
                    await ws.send_text(json.dumps(data, ensure_ascii=False))
                    status = data.get("payload", {}).get("choices", {}).get("status")
                    # status == 2 marks the last frame. A frame without a
                    # payload status (already forwarded above) also ends the
                    # exchange instead of raising KeyError.
                    if status == 2 or status is None:
                        break
        except Exception as e:
            # Report upstream failures to the client as plain text.
            await ws.send_text(str(e))
    except Exception as e:
        print(e)
    finally:
        # Close exactly once on every path.
        try:
            await ws.close()
        except RuntimeError:
            # NOTE(review): close() is expected to raise RuntimeError when the
            # socket is already closed (e.g. the client disconnected) - confirm
            # against the installed Starlette version.
            pass
# When executed as a script, serve the app with uvicorn on all interfaces, port 8005.
if __name__ == '__main__':
    uvicorn.run(app, host="0.0.0.0", port=8005)

8
configs/__init__.py Executable file
View File

@ -0,0 +1,8 @@
from .basic_config import *
from .model_config import *
from .kb_config import *
from .server_config import *
from .prompt_config import *
VERSION = "v0.2.7"

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

25
configs/basic_config.py Executable file
View File

@ -0,0 +1,25 @@
import logging
import os
import langchain
# Whether to show verbose logs.
log_verbose = False
langchain.verbose = False
# Whether to save chat history.
SAVE_CHAT_HISTORY = False
# The settings below normally do not need to be changed.
# Log record format.
LOG_FORMAT = "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
# The root logger is set to INFO and formatted with LOG_FORMAT.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format=LOG_FORMAT)
# Log storage directory: "logs" under the parent of the directory containing this file.
LOG_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
# Created at import time. exist_ok=True avoids the check-then-create race
# when several processes import this module at once.
os.makedirs(LOG_PATH, exist_ok=True)

25
configs/basic_config.py.example Executable file
View File

@ -0,0 +1,25 @@
import logging
import os
import langchain
# Whether to show verbose logs.
log_verbose = False
langchain.verbose = False
# Whether to save chat history.
SAVE_CHAT_HISTORY = False
# The settings below normally do not need to be changed.
# Log record format.
LOG_FORMAT = "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
# The root logger is set to INFO and formatted with LOG_FORMAT.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format=LOG_FORMAT)
# Log storage directory: "logs" under the parent of the directory containing this file.
LOG_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
# Created at import time. exist_ok=True avoids the check-then-create race
# when several processes import this module at once.
os.makedirs(LOG_PATH, exist_ok=True)

132
configs/kb_config.py Executable file
View File

@ -0,0 +1,132 @@
import os
# 默认使用的知识库
DEFAULT_KNOWLEDGE_BASE = "ceshi"
# 默认向量库/全文检索引擎类型。可选faiss, milvus(离线) & zilliz(在线), pgvector,全文检索引擎es
DEFAULT_VS_TYPE = "faiss"
# 缓存向量库数量针对FAISS
CACHED_VS_NUM = 1
# 知识库中单段文本长度(不适用MarkdownHeaderTextSplitter)
CHUNK_SIZE = 1500
# 知识库中相邻文本重合长度(不适用MarkdownHeaderTextSplitter)
OVERLAP_SIZE = 300
# 知识库匹配向量数量
VECTOR_SEARCH_TOP_K = 3
# 知识库匹配相关度阈值取值范围在0-1之间SCORE越小相关度越高取到1相当于不筛选建议设置在0.5左右
SCORE_THRESHOLD = 1
# 默认搜索引擎。可选bing, duckduckgo, metaphor
DEFAULT_SEARCH_ENGINE = "duckduckgo"
# 搜索引擎匹配结果数量
SEARCH_ENGINE_TOP_K = 3
# Bing 搜索必备变量
# 使用 Bing 搜索需要使用 Bing Subscription Key,需要在azure port中申请试用bing search
# 具体申请方式请见
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource
# 使用python创建bing api 搜索实例详见:
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/quickstarts/rest/python
BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
# 注意不是bing Webmaster Tools的api key
# 此外如果是在服务器上报Failed to establish a new connection: [Errno 110] Connection timed out
# 是因为服务器加了防火墙需要联系管理员加白名单如果公司的服务器的话就别想了GG
BING_SUBSCRIPTION_KEY = ""
# metaphor搜索需要KEY
METAPHOR_API_KEY = ""
# 是否开启中文标题加强,以及标题增强的相关配置
# 通过增加标题判断判断哪些文本为标题并在metadata中进行标记
# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
ZH_TITLE_ENHANCE = False
# 每个知识库的初始化介绍用于在初始化知识库时显示和Agent调用没写则没有介绍不会被Agent调用。
KB_INFO = {
"知识库名称": "知识库介绍",
"samples": "关于本项目issue的解答",
}
# The settings below normally do not need to be changed.
# Default knowledge-base storage directory: "knowledge_base" under the parent
# of the directory containing this file.
KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
# Created at import time. exist_ok=True avoids the check-then-create race
# when several processes import this module at once.
os.makedirs(KB_ROOT_PATH, exist_ok=True)
# Default database file path.
# With sqlite, change DB_ROOT_PATH; for any other database, set
# SQLALCHEMY_DATABASE_URI directly.
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}"
# Available vector-store types and the connection settings for each.
# NOTE(review): the "milvus" entry below embeds a non-local host address and a
# password directly in source control; consider loading these from the
# environment and rotating the committed password.
kbs_config = {
    "faiss": {
    },
    "milvus": {
        "host": "60.204.152.17",
        "port": "19530",
        "user": "root",
        "password": "milvus_huawei_cloud",
        "secure": False,
    },
    "zilliz": {
        "host": "in01-a7ce524e41e3935.ali-cn-hangzhou.vectordb.zilliz.com.cn",
        "port": "19530",
        "user": "",
        "password": "",
        "secure": True,
    },
    "pg": {
        "connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat",
    },
    "es": {
        "host": "127.0.0.1",
        "port": "9200",
        "index_name": "test_index",
        "user": "",
        "password": ""
    }
}
# TextSplitter配置项如果你不明白其中的含义就不要修改。
text_splitter_dict = {
"ChineseRecursiveTextSplitter": {
"source": "", ## 选择tiktoken则使用openai的方法
"tokenizer_name_or_path": "",
},
"SpacyTextSplitter": {
"source": "huggingface",
"tokenizer_name_or_path": "gpt2",
},
"RecursiveCharacterTextSplitter": {
"source": "tiktoken",
"tokenizer_name_or_path": "cl100k_base",
},
"MarkdownHeaderTextSplitter": {
"headers_to_split_on":
[
("#", "head1"),
("##", "head2"),
("###", "head3"),
("####", "head4"),
]
},
}
# TEXT_SPLITTER 名称
TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter"
# Embedding模型定制词语的词表文件
EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt"

132
configs/kb_config.py.example Executable file
View File

@ -0,0 +1,132 @@
import os
# 默认使用的知识库
DEFAULT_KNOWLEDGE_BASE = "samples"
# 默认向量库/全文检索引擎类型。可选faiss, milvus(离线) & zilliz(在线), pgvector,全文检索引擎es
DEFAULT_VS_TYPE = "faiss"
# 缓存向量库数量针对FAISS
CACHED_VS_NUM = 1
# 知识库中单段文本长度(不适用MarkdownHeaderTextSplitter)
CHUNK_SIZE = 250
# 知识库中相邻文本重合长度(不适用MarkdownHeaderTextSplitter)
OVERLAP_SIZE = 50
# 知识库匹配向量数量
VECTOR_SEARCH_TOP_K = 3
# 知识库匹配相关度阈值取值范围在0-1之间SCORE越小相关度越高取到1相当于不筛选建议设置在0.5左右
SCORE_THRESHOLD = 1
# 默认搜索引擎。可选bing, duckduckgo, metaphor
DEFAULT_SEARCH_ENGINE = "duckduckgo"
# 搜索引擎匹配结果数量
SEARCH_ENGINE_TOP_K = 3
# Bing 搜索必备变量
# 使用 Bing 搜索需要使用 Bing Subscription Key,需要在azure port中申请试用bing search
# 具体申请方式请见
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource
# 使用python创建bing api 搜索实例详见:
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/quickstarts/rest/python
BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
# 注意不是bing Webmaster Tools的api key
# 此外如果是在服务器上报Failed to establish a new connection: [Errno 110] Connection timed out
# 是因为服务器加了防火墙需要联系管理员加白名单如果公司的服务器的话就别想了GG
BING_SUBSCRIPTION_KEY = ""
# metaphor搜索需要KEY
METAPHOR_API_KEY = ""
# 是否开启中文标题加强,以及标题增强的相关配置
# 通过增加标题判断判断哪些文本为标题并在metadata中进行标记
# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
ZH_TITLE_ENHANCE = False
# 每个知识库的初始化介绍用于在初始化知识库时显示和Agent调用没写则没有介绍不会被Agent调用。
KB_INFO = {
"知识库名称": "知识库介绍",
"samples": "关于本项目issue的解答",
}
# The settings below normally do not need to be changed.
# Default knowledge-base storage directory: "knowledge_base" under the parent
# of the directory containing this file.
KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
# Created at import time. exist_ok=True avoids the check-then-create race
# when several processes import this module at once.
os.makedirs(KB_ROOT_PATH, exist_ok=True)
# Default database file path.
# With sqlite, change DB_ROOT_PATH; for any other database, set
# SQLALCHEMY_DATABASE_URI directly.
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}"
# 可选向量库类型及对应配置
kbs_config = {
"faiss": {
},
"milvus": {
"host": "127.0.0.1",
"port": "19530",
"user": "",
"password": "",
"secure": False,
},
"zilliz": {
"host": "in01-a7ce524e41e3935.ali-cn-hangzhou.vectordb.zilliz.com.cn",
"port": "19530",
"user": "",
"password": "",
"secure": True,
},
"pg": {
"connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat",
},
"es": {
"host": "127.0.0.1",
"port": "9200",
"index_name": "test_index",
"user": "",
"password": ""
}
}
# TextSplitter配置项如果你不明白其中的含义就不要修改。
text_splitter_dict = {
"ChineseRecursiveTextSplitter": {
"source": "huggingface", ## 选择tiktoken则使用openai的方法
"tokenizer_name_or_path": "",
},
"SpacyTextSplitter": {
"source": "huggingface",
"tokenizer_name_or_path": "gpt2",
},
"RecursiveCharacterTextSplitter": {
"source": "tiktoken",
"tokenizer_name_or_path": "cl100k_base",
},
"MarkdownHeaderTextSplitter": {
"headers_to_split_on":
[
("#", "head1"),
("##", "head2"),
("###", "head3"),
("####", "head4"),
]
},
}
# TEXT_SPLITTER 名称
TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter"
# Embedding模型定制词语的词表文件
EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt"

273
configs/model_config.py Executable file
View File

@ -0,0 +1,273 @@
import os
# 可以指定一个绝对路径统一存放所有的Embedding和LLM模型。
# 每个模型可以是一个单独的目录,也可以是某个目录下的二级子目录。
# 如果模型目录名称和 MODEL_PATH 中的 key 或 value 相同,程序会自动检测加载,无需修改 MODEL_PATH 中的路径。
MODEL_ROOT_PATH = ""
# 选用的 Embedding 名称
EMBEDDING_MODEL = "m3e-base" # bge-large-zh
# Embedding 模型运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。
EMBEDDING_DEVICE = "auto"
# 如果需要在 EMBEDDING_MODEL 中增加自定义的关键字时配置
EMBEDDING_KEYWORD_FILE = "keywords.txt"
EMBEDDING_MODEL_OUTPUT_PATH = "output"
# 要运行的 LLM 名称,可以包括本地模型和在线模型。
# 第一个将作为 API 和 WEBUI 的默认模型
LLM_MODELS = ["chatglm3-6b"]
# AgentLM模型的名称 (可以不指定指定之后就锁定进入Agent之后的Chain的模型不指定就是LLM_MODELS[0])
Agent_MODEL = None
# LLM 运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。
LLM_DEVICE = "auto"
# 历史对话轮数
HISTORY_LEN = 3
# 大模型最长支持的长度,如果不填写,则使用模型默认的最大长度,如果填写,则为用户设定的最大长度
MAX_TOKENS = None
# LLM通用对话参数
TEMPERATURE = 0.7
# TOP_P = 0.95 # ChatOpenAI暂不支持该参数
ONLINE_LLM_MODEL = {
# 线上模型。请在server_config中为每个在线API设置不同的端口
"openai-api": {
"model_name": "gpt-35-turbo",
"api_base_url": "https://api.openai.com/v1",
"api_key": "",
"openai_proxy": "",
},
# 具体注册及api key获取请前往 http://open.bigmodel.cn
"zhipu-api": {
"api_key": "",
"version": "chatglm_turbo", # 可选包括 "chatglm_turbo"
"provider": "ChatGLMWorker",
},
# 具体注册及api key获取请前往 https://api.minimax.chat/
"minimax-api": {
"group_id": "",
"api_key": "",
"is_pro": False,
"provider": "MiniMaxWorker",
},
# 具体注册及api key获取请前往 https://xinghuo.xfyun.cn/
"xinghuo-api": {
"APPID": "",
"APISecret": "",
"api_key": "",
"version": "v1.5", # 你使用的讯飞星火大模型版本,可选包括 "v3.0", "v1.5", "v2.0"
"provider": "XingHuoWorker",
},
# 百度千帆 API申请方式请参考 https://cloud.baidu.com/doc/WENXINWORKSHOP/s/4lilb2lpf
"qianfan-api": {
"version": "ERNIE-Bot", # 注意大小写。当前支持 "ERNIE-Bot" 或 "ERNIE-Bot-turbo" 更多的见官方文档。
"version_url": "", # 也可以不填写version直接填写在千帆申请模型发布的API地址
"api_key": "",
"secret_key": "",
"provider": "QianFanWorker",
},
# 火山方舟 API文档参考 https://www.volcengine.com/docs/82379
"fangzhou-api": {
"version": "chatglm-6b-model", # 当前支持 "chatglm-6b-model" 更多的见文档模型支持列表中方舟部分。
"version_url": "", # 可以不填写version直接填写在方舟申请模型发布的API地址
"api_key": "",
"secret_key": "",
"provider": "FangZhouWorker",
},
# 阿里云通义千问 API文档参考 https://help.aliyun.com/zh/dashscope/developer-reference/api-details
"qwen-api": {
"version": "qwen-turbo", # 可选包括 "qwen-turbo", "qwen-plus"
"api_key": "", # 请在阿里云控制台模型服务灵积API-KEY管理页面创建
"provider": "QwenWorker",
},
# 百川 API申请方式请参考 https://www.baichuan-ai.com/home#api-enter
"baichuan-api": {
"version": "Baichuan2-53B", # 当前支持 "Baichuan2-53B" 见官方文档。
"api_key": "",
"secret_key": "",
"provider": "BaiChuanWorker",
},
# Azure API
"azure-api": {
"deployment_name": "", # 部署容器的名字
"resource_name": "", # https://{resource_name}.openai.azure.com/openai/ 填写resource_name的部分其他部分不要填写
"api_version": "", # API的版本不是模型版本
"api_key": "",
"provider": "AzureWorker",
},
}
# 在以下字典中修改属性值以指定本地embedding模型存储位置。支持3种设置方法
# 1、将对应的值修改为模型绝对路径
# 2、不修改此处的值以 text2vec 为例):
# 2.1 如果{MODEL_ROOT_PATH}下存在如下任一子目录:
# - text2vec
# - GanymedeNil/text2vec-large-chinese
# - text2vec-large-chinese
# 2.2 如果以上本地路径不存在则使用huggingface模型
MODEL_PATH = {
"embed_model": {
"ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
"ernie-base": "nghuyong/ernie-3.0-base-zh",
"text2vec-base": "shibing624/text2vec-base-chinese",
"text2vec": "GanymedeNil/text2vec-large-chinese",
"text2vec-paraphrase": "shibing624/text2vec-base-chinese-paraphrase",
"text2vec-sentence": "shibing624/text2vec-base-chinese-sentence",
"text2vec-multilingual": "shibing624/text2vec-base-multilingual",
"text2vec-bge-large-chinese": "shibing624/text2vec-bge-large-chinese",
"m3e-small": "moka-ai/m3e-small",
"m3e-base": "moka-ai/m3e-base",
"m3e-large": "moka-ai/m3e-large",
"bge-small-zh": "BAAI/bge-small-zh",
"bge-base-zh": "BAAI/bge-base-zh",
"bge-large-zh": "BAAI/bge-large-zh",
"bge-large-zh-noinstruct": "BAAI/bge-large-zh-noinstruct",
"bge-base-zh-v1.5": "BAAI/bge-base-zh-v1.5",
"bge-large-zh-v1.5": "BAAI/bge-large-zh-v1.5",
"piccolo-base-zh": "sensenova/piccolo-base-zh",
"piccolo-large-zh": "sensenova/piccolo-large-zh",
"text-embedding-ada-002": "your OPENAI_API_KEY",
},
"llm_model": {
# 以下部分模型并未完全测试仅根据fastchat和vllm模型的模型列表推定支持
"chatglm2-6b": "THUDM/chatglm2-6b",
"chatglm2-6b-32k": "THUDM/chatglm2-6b-32k",
"baichuan2-13b": "baichuan-inc/Baichuan2-13B-Chat",
"baichuan2-7b": "baichuan-inc/Baichuan2-7B-Chat",
"baichuan-7b": "baichuan-inc/Baichuan-7B",
"baichuan-13b": "baichuan-inc/Baichuan-13B",
'baichuan-13b-chat': 'baichuan-inc/Baichuan-13B-Chat',
"aquila-7b": "BAAI/Aquila-7B",
"aquilachat-7b": "BAAI/AquilaChat-7B",
"internlm-7b": "internlm/internlm-7b",
"internlm-chat-7b": "internlm/internlm-chat-7b",
"falcon-7b": "tiiuae/falcon-7b",
"falcon-40b": "tiiuae/falcon-40b",
"falcon-rw-7b": "tiiuae/falcon-rw-7b",
"gpt2": "gpt2",
"gpt2-xl": "gpt2-xl",
"gpt-j-6b": "EleutherAI/gpt-j-6b",
"gpt4all-j": "nomic-ai/gpt4all-j",
"gpt-neox-20b": "EleutherAI/gpt-neox-20b",
"pythia-12b": "EleutherAI/pythia-12b",
"oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
"dolly-v2-12b": "databricks/dolly-v2-12b",
"stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",
"Llama-2-13b-hf": "meta-llama/Llama-2-13b-hf",
"Llama-2-70b-hf": "meta-llama/Llama-2-70b-hf",
"open_llama_13b": "openlm-research/open_llama_13b",
"vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
"koala": "young-geng/koala",
"mpt-7b": "mosaicml/mpt-7b",
"mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
"mpt-30b": "mosaicml/mpt-30b",
"opt-66b": "facebook/opt-66b",
"opt-iml-max-30b": "facebook/opt-iml-max-30b",
"Qwen-7B": "Qwen/Qwen-7B",
"Qwen-14B": "Qwen/Qwen-14B",
"Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
"Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
"Qwen-14B-Chat-Int8": "Qwen/Qwen-14B-Chat-Int8", # 确保已经安装了auto-gptq optimum flash-attn
"Qwen-14B-Chat-Int4": "Qwen/Qwen-14B-Chat-Int4", # 确保已经安装了auto-gptq optimum flash-attn
},
}
# 通常情况下不需要更改以下内容
# nltk 模型存储路径
NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
VLLM_MODEL_DICT = {
"aquila-7b": "BAAI/Aquila-7B",
"aquilachat-7b": "BAAI/AquilaChat-7B",
"baichuan-7b": "baichuan-inc/Baichuan-7B",
"baichuan-13b": "baichuan-inc/Baichuan-13B",
'baichuan-13b-chat': 'baichuan-inc/Baichuan-13B-Chat',
# 注意bloom系列的tokenizer与model是分离的因此虽然vllm支持但与fschat框架不兼容
# "bloom":"bigscience/bloom",
# "bloomz":"bigscience/bloomz",
# "bloomz-560m":"bigscience/bloomz-560m",
# "bloomz-7b1":"bigscience/bloomz-7b1",
# "bloomz-1b7":"bigscience/bloomz-1b7",
"internlm-7b": "internlm/internlm-7b",
"internlm-chat-7b": "internlm/internlm-chat-7b",
"falcon-7b": "tiiuae/falcon-7b",
"falcon-40b": "tiiuae/falcon-40b",
"falcon-rw-7b": "tiiuae/falcon-rw-7b",
"gpt2": "gpt2",
"gpt2-xl": "gpt2-xl",
"gpt-j-6b": "EleutherAI/gpt-j-6b",
"gpt4all-j": "nomic-ai/gpt4all-j",
"gpt-neox-20b": "EleutherAI/gpt-neox-20b",
"pythia-12b": "EleutherAI/pythia-12b",
"oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
"dolly-v2-12b": "databricks/dolly-v2-12b",
"stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",
"Llama-2-13b-hf": "meta-llama/Llama-2-13b-hf",
"Llama-2-70b-hf": "meta-llama/Llama-2-70b-hf",
"open_llama_13b": "openlm-research/open_llama_13b",
"vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
"koala": "young-geng/koala",
"mpt-7b": "mosaicml/mpt-7b",
"mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
"mpt-30b": "mosaicml/mpt-30b",
"opt-66b": "facebook/opt-66b",
"opt-iml-max-30b": "facebook/opt-iml-max-30b",
"Qwen-7B": "Qwen/Qwen-7B",
"Qwen-14B": "Qwen/Qwen-14B",
"Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
"Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
"agentlm-7b": "THUDM/agentlm-7b",
"agentlm-13b": "THUDM/agentlm-13b",
"agentlm-70b": "THUDM/agentlm-70b",
}
# 你认为支持Agent能力的模型可以在这里添加添加后不会出现可视化界面的警告
SUPPORT_AGENT_MODEL = [
"azure-api",
"openai-api",
"claude-api",
"zhipu-api",
"qwen-api",
"Qwen",
"baichuan-api",
"agentlm",
"chatglm3",
"xinghuo-api",
]

273
configs/model_config.py.example Executable file
View File

@ -0,0 +1,273 @@
import os
# 可以指定一个绝对路径统一存放所有的Embedding和LLM模型。
# 每个模型可以是一个单独的目录,也可以是某个目录下的二级子目录。
# 如果模型目录名称和 MODEL_PATH 中的 key 或 value 相同,程序会自动检测加载,无需修改 MODEL_PATH 中的路径。
MODEL_ROOT_PATH = ""
# 选用的 Embedding 名称
EMBEDDING_MODEL = "m3e-base" # bge-large-zh
# Embedding 模型运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。
EMBEDDING_DEVICE = "auto"
# 如果需要在 EMBEDDING_MODEL 中增加自定义的关键字时配置
EMBEDDING_KEYWORD_FILE = "keywords.txt"
EMBEDDING_MODEL_OUTPUT_PATH = "output"
# 要运行的 LLM 名称,可以包括本地模型和在线模型。
# 第一个将作为 API 和 WEBUI 的默认模型
LLM_MODELS = ["chatglm2-6b", "zhipu-api", "openai-api"]
# AgentLM模型的名称 (可以不指定指定之后就锁定进入Agent之后的Chain的模型不指定就是LLM_MODELS[0])
Agent_MODEL = None
# LLM 运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。
LLM_DEVICE = "auto"
# 历史对话轮数
HISTORY_LEN = 3
# 大模型最长支持的长度,如果不填写,则使用模型默认的最大长度,如果填写,则为用户设定的最大长度
MAX_TOKENS = None
# LLM通用对话参数
TEMPERATURE = 0.7
# TOP_P = 0.95 # ChatOpenAI暂不支持该参数
ONLINE_LLM_MODEL = {
# 线上模型。请在server_config中为每个在线API设置不同的端口
"openai-api": {
"model_name": "gpt-35-turbo",
"api_base_url": "https://api.openai.com/v1",
"api_key": "",
"openai_proxy": "",
},
# 具体注册及api key获取请前往 http://open.bigmodel.cn
"zhipu-api": {
"api_key": "",
"version": "chatglm_turbo", # 可选包括 "chatglm_turbo"
"provider": "ChatGLMWorker",
},
# 具体注册及api key获取请前往 https://api.minimax.chat/
"minimax-api": {
"group_id": "",
"api_key": "",
"is_pro": False,
"provider": "MiniMaxWorker",
},
# 具体注册及api key获取请前往 https://xinghuo.xfyun.cn/
"xinghuo-api": {
"APPID": "",
"APISecret": "",
"api_key": "",
"version": "v1.5", # 你使用的讯飞星火大模型版本,可选包括 "v3.0", "v1.5", "v2.0"
"provider": "XingHuoWorker",
},
# 百度千帆 API申请方式请参考 https://cloud.baidu.com/doc/WENXINWORKSHOP/s/4lilb2lpf
"qianfan-api": {
"version": "ERNIE-Bot", # 注意大小写。当前支持 "ERNIE-Bot" 或 "ERNIE-Bot-turbo" 更多的见官方文档。
"version_url": "", # 也可以不填写version直接填写在千帆申请模型发布的API地址
"api_key": "",
"secret_key": "",
"provider": "QianFanWorker",
},
# 火山方舟 API文档参考 https://www.volcengine.com/docs/82379
"fangzhou-api": {
"version": "chatglm-6b-model", # 当前支持 "chatglm-6b-model" 更多的见文档模型支持列表中方舟部分。
"version_url": "", # 可以不填写version直接填写在方舟申请模型发布的API地址
"api_key": "",
"secret_key": "",
"provider": "FangZhouWorker",
},
# 阿里云通义千问 API文档参考 https://help.aliyun.com/zh/dashscope/developer-reference/api-details
"qwen-api": {
"version": "qwen-turbo", # 可选包括 "qwen-turbo", "qwen-plus"
"api_key": "", # 请在阿里云控制台模型服务灵积API-KEY管理页面创建
"provider": "QwenWorker",
},
# 百川 API申请方式请参考 https://www.baichuan-ai.com/home#api-enter
"baichuan-api": {
"version": "Baichuan2-53B", # 当前支持 "Baichuan2-53B" 见官方文档。
"api_key": "",
"secret_key": "",
"provider": "BaiChuanWorker",
},
# Azure API
"azure-api": {
"deployment_name": "", # 部署容器的名字
"resource_name": "", # https://{resource_name}.openai.azure.com/openai/ 填写resource_name的部分其他部分不要填写
"api_version": "", # API的版本不是模型版本
"api_key": "",
"provider": "AzureWorker",
},
}
# 在以下字典中修改属性值以指定本地embedding模型存储位置。支持3种设置方法
# 1、将对应的值修改为模型绝对路径
# 2、不修改此处的值以 text2vec 为例):
# 2.1 如果{MODEL_ROOT_PATH}下存在如下任一子目录:
# - text2vec
# - GanymedeNil/text2vec-large-chinese
# - text2vec-large-chinese
# 2.2 如果以上本地路径不存在则使用huggingface模型
MODEL_PATH = {
"embed_model": {
"ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
"ernie-base": "nghuyong/ernie-3.0-base-zh",
"text2vec-base": "shibing624/text2vec-base-chinese",
"text2vec": "GanymedeNil/text2vec-large-chinese",
"text2vec-paraphrase": "shibing624/text2vec-base-chinese-paraphrase",
"text2vec-sentence": "shibing624/text2vec-base-chinese-sentence",
"text2vec-multilingual": "shibing624/text2vec-base-multilingual",
"text2vec-bge-large-chinese": "shibing624/text2vec-bge-large-chinese",
"m3e-small": "moka-ai/m3e-small",
"m3e-base": "moka-ai/m3e-base",
"m3e-large": "moka-ai/m3e-large",
"bge-small-zh": "BAAI/bge-small-zh",
"bge-base-zh": "BAAI/bge-base-zh",
"bge-large-zh": "BAAI/bge-large-zh",
"bge-large-zh-noinstruct": "BAAI/bge-large-zh-noinstruct",
"bge-base-zh-v1.5": "BAAI/bge-base-zh-v1.5",
"bge-large-zh-v1.5": "BAAI/bge-large-zh-v1.5",
"piccolo-base-zh": "sensenova/piccolo-base-zh",
"piccolo-large-zh": "sensenova/piccolo-large-zh",
"text-embedding-ada-002": "your OPENAI_API_KEY",
},
"llm_model": {
# 以下部分模型并未完全测试仅根据fastchat和vllm模型的模型列表推定支持
"chatglm2-6b": "THUDM/chatglm2-6b",
"chatglm2-6b-32k": "THUDM/chatglm2-6b-32k",
"baichuan2-13b": "baichuan-inc/Baichuan2-13B-Chat",
"baichuan2-7b": "baichuan-inc/Baichuan2-7B-Chat",
"baichuan-7b": "baichuan-inc/Baichuan-7B",
"baichuan-13b": "baichuan-inc/Baichuan-13B",
'baichuan-13b-chat': 'baichuan-inc/Baichuan-13B-Chat',
"aquila-7b": "BAAI/Aquila-7B",
"aquilachat-7b": "BAAI/AquilaChat-7B",
"internlm-7b": "internlm/internlm-7b",
"internlm-chat-7b": "internlm/internlm-chat-7b",
"falcon-7b": "tiiuae/falcon-7b",
"falcon-40b": "tiiuae/falcon-40b",
"falcon-rw-7b": "tiiuae/falcon-rw-7b",
"gpt2": "gpt2",
"gpt2-xl": "gpt2-xl",
"gpt-j-6b": "EleutherAI/gpt-j-6b",
"gpt4all-j": "nomic-ai/gpt4all-j",
"gpt-neox-20b": "EleutherAI/gpt-neox-20b",
"pythia-12b": "EleutherAI/pythia-12b",
"oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
"dolly-v2-12b": "databricks/dolly-v2-12b",
"stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",
"Llama-2-13b-hf": "meta-llama/Llama-2-13b-hf",
"Llama-2-70b-hf": "meta-llama/Llama-2-70b-hf",
"open_llama_13b": "openlm-research/open_llama_13b",
"vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
"koala": "young-geng/koala",
"mpt-7b": "mosaicml/mpt-7b",
"mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
"mpt-30b": "mosaicml/mpt-30b",
"opt-66b": "facebook/opt-66b",
"opt-iml-max-30b": "facebook/opt-iml-max-30b",
"Qwen-7B": "Qwen/Qwen-7B",
"Qwen-14B": "Qwen/Qwen-14B",
"Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
"Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
"Qwen-14B-Chat-Int8": "Qwen/Qwen-14B-Chat-Int8", # 确保已经安装了auto-gptq optimum flash-attn
"Qwen-14B-Chat-Int4": "Qwen/Qwen-14B-Chat-Int4", # 确保已经安装了auto-gptq optimum flash-attn
},
}
# 通常情况下不需要更改以下内容
# nltk 模型存储路径
NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
VLLM_MODEL_DICT = {
"aquila-7b": "BAAI/Aquila-7B",
"aquilachat-7b": "BAAI/AquilaChat-7B",
"baichuan-7b": "baichuan-inc/Baichuan-7B",
"baichuan-13b": "baichuan-inc/Baichuan-13B",
'baichuan-13b-chat': 'baichuan-inc/Baichuan-13B-Chat',
# 注意bloom系列的tokenizer与model是分离的因此虽然vllm支持但与fschat框架不兼容
# "bloom":"bigscience/bloom",
# "bloomz":"bigscience/bloomz",
# "bloomz-560m":"bigscience/bloomz-560m",
# "bloomz-7b1":"bigscience/bloomz-7b1",
# "bloomz-1b7":"bigscience/bloomz-1b7",
"internlm-7b": "internlm/internlm-7b",
"internlm-chat-7b": "internlm/internlm-chat-7b",
"falcon-7b": "tiiuae/falcon-7b",
"falcon-40b": "tiiuae/falcon-40b",
"falcon-rw-7b": "tiiuae/falcon-rw-7b",
"gpt2": "gpt2",
"gpt2-xl": "gpt2-xl",
"gpt-j-6b": "EleutherAI/gpt-j-6b",
"gpt4all-j": "nomic-ai/gpt4all-j",
"gpt-neox-20b": "EleutherAI/gpt-neox-20b",
"pythia-12b": "EleutherAI/pythia-12b",
"oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
"dolly-v2-12b": "databricks/dolly-v2-12b",
"stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",
"Llama-2-13b-hf": "meta-llama/Llama-2-13b-hf",
"Llama-2-70b-hf": "meta-llama/Llama-2-70b-hf",
"open_llama_13b": "openlm-research/open_llama_13b",
"vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
"koala": "young-geng/koala",
"mpt-7b": "mosaicml/mpt-7b",
"mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
"mpt-30b": "mosaicml/mpt-30b",
"opt-66b": "facebook/opt-66b",
"opt-iml-max-30b": "facebook/opt-iml-max-30b",
"Qwen-7B": "Qwen/Qwen-7B",
"Qwen-14B": "Qwen/Qwen-14B",
"Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
"Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
"agentlm-7b": "THUDM/agentlm-7b",
"agentlm-13b": "THUDM/agentlm-13b",
"agentlm-70b": "THUDM/agentlm-70b",
}
# 你认为支持Agent能力的模型可以在这里添加添加后不会出现可视化界面的警告
SUPPORT_AGENT_MODEL = [
"azure-api",
"openai-api",
"claude-api",
"zhipu-api",
"qwen-api",
"Qwen",
"baichuan-api",
"agentlm",
"chatglm3",
"xinghuo-api",
]

158
configs/prompt_config.py Executable file
View File

@ -0,0 +1,158 @@
# prompt模板使用Jinja2语法简单点就是用双大括号代替f-string的单大括号
# 本配置文件支持热加载修改prompt模板后无需重启服务。
# LLM对话支持的变量
# - input: 用户输入内容
# 知识库和搜索引擎对话支持的变量:
# - context: 从检索结果拼接的知识文本
# - question: 用户提出的问题
# Agent对话支持的变量
# - tools: 可用的工具列表
# - tool_names: 可用的工具名称列表
# - history: 用户和Agent的对话历史
# - input: 用户输入内容
# - agent_scratchpad: Agent的思维记录
PROMPT_TEMPLATES = {
"completion": {
"default": "{input}"
},
"llm_chat": {
"default": "{{ input }}",
"py":
"""
你是一个聪明的代码助手请你给我写出简单的py代码 \n
{{ input }}
"""
,
},
"knowledge_base_chat": {
"default":
"""
<指令>根据已知信息简洁和专业的来回答问题如果无法从中得到答案请说 根据已知信息无法回答该问题不允许在答案中添加编造成分答案请使用中文 </指令>
<已知信息>{{ context }}</已知信息>
<问题>{{ question }}</问题>
""",
"text":
"""
<指令>根据已知信息简洁和专业的来回答问题如果无法从中得到答案请说 根据已知信息无法回答该问题答案请使用中文 </指令>
<已知信息>{{ context }}</已知信息>
<问题>{{ question }}</问题>
""",
"Empty": # 搜不到内容的时候调用此时没有已知信息这个Empty可以更改但不能删除会影响程序使用
"""
<指令>请根据用户的问题进行简洁明了的回答</指令>
<问题>{{ question }}</问题>
""",
},
"search_engine_chat": {
"default":
"""
<指令>这是我搜索到的互联网信息请你根据这些信息进行提取并有调理简洁的回答问题如果无法从中得到答案请说 无法搜索到能回答问题的内容 </指令>
<已知信息>{{ context }}</已知信息>
<问题>{{ question }}</问题>
""",
"search":
"""
<指令>根据已知信息简洁和专业的来回答问题如果无法从中得到答案请说 根据已知信息无法回答该问题答案请使用中文 </指令>
<已知信息>{{ context }}</已知信息>
<问题>{{ question }}</问题>
""",
"Empty": # 搜不到内容的时候调用此时没有已知信息这个Empty可以更改但不能删除会影响程序使用
"""
<指令>请根据用户的问题进行简洁明了的回答</指令>
<问题>{{ question }}</问题>
""",
},
"agent_chat": {
"default":
"""
Answer the following questions as best you can. If it is in order, you can use some tools appropriately.You have access to the following tools:
{tools}
Please note that the "知识库查询工具" is information about the "西交利物浦大学" ,and if a question is asked about it, you must answer with the knowledge base
Please note that the "天气查询工具" can only be used once since Question begin.
Use the following format:
Question: the input question you must answer1
Thought: you should always think about what to do and what tools to use.
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
history:
{history}
Question: {input}
Thought: {agent_scratchpad}
""",
"AgentLM":
"""
<SYS>>\n
You are a helpful, respectful and honest assistant.
</SYS>>\n
Answer the following questions as best you can. If it is in order, you can use some tools appropriately.You have access to the following tools:
{tools}.
Use the following steps and think step by step!:
Question: the input question you must answer1
Thought: you should always think about what to do and what tools to use.
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin! let's think step by step!
history:
{history}
Question: {input}
Thought: {agent_scratchpad}
""",
"中文版本":
"""
你的知识不一定正确所以你一定要用提供的工具来思考并给出用户答案
你有以下工具可以使用:
{tools}
请请严格按照提供的思维方式来思考所有的关键词都要输出例如ActionAction InputObservation等
```
Question: 用户的提问或者观察到的信息
Thought: 你应该思考该做什么是根据工具的结果来回答问题还是决定使用什么工具
Action: 需要使用的工具应该是在[{tool_names}]中的一个
Action Input: 传入工具的内容
Observation: 工具给出的答案不是你生成的
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: 通过工具给出的答案你是否能回答Question
Final Answer是你的答案
现在我们开始
你和用户的历史记录:
History:
{history}
用户开始以提问
Question: {input}
Thought: {agent_scratchpad}
""",
},
}

158
configs/prompt_config.py.example Executable file
View File

@ -0,0 +1,158 @@
# Prompt templates use Jinja2 syntax; put simply, double curly braces replace the single curly braces of an f-string.
# NOTE(review): the "completion" and "agent_chat" templates below use single braces ({input}, {tools}, ...)
# rather than Jinja2 double braces — presumably they are rendered with str.format-style substitution; confirm
# against the code that consumes them before changing the brace style.
# This config file supports hot reloading: no service restart is needed after editing a prompt template.
# Variables supported by LLM chat:
# - input: the user's input
# Variables supported by knowledge-base chat and search-engine chat:
# - context: knowledge text concatenated from the retrieval results
# - question: the question asked by the user
# Variables supported by Agent chat:
# - tools: list of available tools
# - tool_names: list of the names of the available tools
# - history: conversation history between the user and the Agent
# - input: the user's input
# - agent_scratchpad: the Agent's record of its reasoning
# Layout: PROMPT_TEMPLATES[<chat type>][<template name>] -> template string.
PROMPT_TEMPLATES = {
"completion": {
"default": "{input}"
},
"llm_chat": {
"default": "{{ input }}",
"py":
"""
你是一个聪明的代码助手请你给我写出简单的py代码。 \n
{{ input }}
"""
,
},
"knowledge_base_chat": {
"default":
"""
<指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,不允许在答案中添加编造成分,答案请使用中文。 </指令>
<已知信息>{{ context }}</已知信息>、
<问题>{{ question }}</问题>
""",
"text":
"""
<指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,答案请使用中文。 </指令>
<已知信息>{{ context }}</已知信息>、
<问题>{{ question }}</问题>
""",
"Empty": # Used when the search finds nothing, so there is no known information; this Empty entry may be edited but must not be deleted, otherwise the program is affected
"""
<指令>请根据用户的问题,进行简洁明了的回答</指令>
<问题>{{ question }}</问题>
""",
},
"search_engine_chat": {
"default":
"""
<指令>这是我搜索到的互联网信息,请你根据这些信息进行提取并有调理,简洁的回答问题。如果无法从中得到答案,请说 “无法搜索到能回答问题的内容”。 </指令>
<已知信息>{{ context }}</已知信息>、
<问题>{{ question }}</问题>
""",
"search":
"""
<指令>根据已知信息,简洁和专业的来回答问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题”,答案请使用中文。 </指令>
<已知信息>{{ context }}</已知信息>、
<问题>{{ question }}</问题>
""",
"Empty": # Used when the search finds nothing, so there is no known information; this Empty entry may be edited but must not be deleted, otherwise the program is affected
"""
<指令>请根据用户的问题,进行简洁明了的回答</指令>
<问题>{{ question }}</问题>
""",
},
"agent_chat": {
"default":
"""
Answer the following questions as best you can. If it is in order, you can use some tools appropriately.You have access to the following tools:
{tools}
Please note that the "知识库查询工具" is information about the "西交利物浦大学" ,and if a question is asked about it, you must answer with the knowledge base
Please note that the "天气查询工具" can only be used once since Question begin.
Use the following format:
Question: the input question you must answer1
Thought: you should always think about what to do and what tools to use.
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
history:
{history}
Question: {input}
Thought: {agent_scratchpad}
""",
"AgentLM":
"""
<SYS>>\n
You are a helpful, respectful and honest assistant.
</SYS>>\n
Answer the following questions as best you can. If it is in order, you can use some tools appropriately.You have access to the following tools:
{tools}.
Use the following steps and think step by step!:
Question: the input question you must answer1
Thought: you should always think about what to do and what tools to use.
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin! let's think step by step!
history:
{history}
Question: {input}
Thought: {agent_scratchpad}
""",
"中文版本":
"""
你的知识不一定正确,所以你一定要用提供的工具来思考,并给出用户答案。
你有以下工具可以使用:
{tools}
请请严格按照提供的思维方式来思考所有的关键词都要输出例如ActionAction InputObservation等
```
Question: 用户的提问或者观察到的信息,
Thought: 你应该思考该做什么,是根据工具的结果来回答问题,还是决定使用什么工具。
Action: 需要使用的工具,应该是在[{tool_names}]中的一个。
Action Input: 传入工具的内容
Observation: 工具给出的答案(不是你生成的)
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: 通过工具给出的答案你是否能回答Question。
Final Answer是你的答案
现在,我们开始!
你和用户的历史记录:
History:
{history}
用户开始以提问:
Question: {input}
Thought: {agent_scratchpad}
""",
},
}

135
configs/server_config.py Executable file
View File

@ -0,0 +1,135 @@
import sys
from configs.model_config import LLM_DEVICE
# Default timeout (in seconds) for httpx requests. If model loading or chat is slow and timeout errors occur, increase this value as appropriate.
HTTPX_DEFAULT_TIMEOUT = 300.0
# Whether the API enables cross-origin access; defaults to False, set it to True to enable.
# is open cross domain
OPEN_CROSS_DOMAIN = False
# Default bind host for every server. If changed to "0.0.0.0", the host of every XX_SERVER below needs to be modified.
# The expression below yields "127.0.0.1" on win32 and "0.0.0.0" on every other platform.
DEFAULT_BIND_HOST = "0.0.0.0" if sys.platform != "win32" else "127.0.0.1"
# webui.py server
WEBUI_SERVER = {
"host": DEFAULT_BIND_HOST,
"port": 8501,
}
# api.py server
API_SERVER = {
"host": DEFAULT_BIND_HOST,
"port": 7861,
}
# fastchat openai_api server
FSCHAT_OPENAI_API = {
"host": DEFAULT_BIND_HOST,
"port": 20000,
}
# fastchat model_worker server
# These models must be correctly configured in model_config.MODEL_PATH or ONLINE_MODEL.
# When launching startup.py, models can be specified with `--model-name xxxx yyyy`; if not specified, LLM_MODELS is used.
FSCHAT_MODEL_WORKERS = {
# Default configuration shared by all models; it can be overridden in a model-specific entry.
"default": {
"host": DEFAULT_BIND_HOST,
"port": 20002,
"device": LLM_DEVICE,
# False or 'vllm': the inference acceleration framework to use. If HuggingFace connectivity problems occur with vllm, see doc/FAQ.
# vllm support for some models is still immature, so it is disabled by default for now.
"infer_turbo": False,
# Parameters to configure for multi-GPU loading in model_worker
# "gpus": None, # GPUs to use, given as a str such as "0,1"; if that has no effect, specify them via CUDA_VISIBLE_DEVICES="0,1" or similar
# "num_gpus": 1, # number of GPUs to use
# "max_gpu_memory": "20GiB", # maximum GPU memory used on each GPU
# The following are less commonly used model_worker parameters; configure them as needed
# "load_8bit": False, # enable 8-bit quantization
# "cpu_offloading": None,
# "gptq_ckpt": None,
# "gptq_wbits": 16,
# "gptq_groupsize": -1,
# "gptq_act_order": False,
# "awq_ckpt": None,
# "awq_wbits": 16,
# "awq_groupsize": -1,
# "model_names": LLM_MODELS,
# "conv_template": None,
# "limit_worker_concurrency": 5,
# "stream_interval": 2,
# "no_register": False,
# "embed_in_truncate": False,
# The following are vllm_worker parameters. Note: vllm requires a GPU and has only been tested on Linux.
# tokenizer = model_path # add here if the tokenizer differs from model_path
# 'tokenizer_mode':'auto',
# 'trust_remote_code':True,
# 'download_dir':None,
# 'load_format':'auto',
# 'dtype':'auto',
# 'seed':0,
# 'worker_use_ray':False,
# 'pipeline_parallel_size':1,
# 'tensor_parallel_size':1,
# 'block_size':16,
# 'swap_space':4 , # GiB
# 'gpu_memory_utilization':0.90,
# 'max_num_batched_tokens':2560,
# 'max_num_seqs':256,
# 'disable_log_stats':False,
# 'conv_template':None,
# 'limit_worker_concurrency':5,
# 'no_register':False,
# 'num_gpus': 1
# 'engine_use_ray': False,
# 'disable_log_requests': False
},
# The default configuration can be overridden per model as in the example below
# "baichuan-7b": { # uses the IP and port from default
# "device": "cpu",
# },
# The entries below need not be modified; the models to launch are chosen in model_config
"zhipu-api": {
"port": 21001,
},
"minimax-api": {
"port": 21002,
},
"xinghuo-api": {
"port": 21003,
},
"qianfan-api": {
"port": 21004,
},
"fangzhou-api": {
"port": 21005,
},
"qwen-api": {
"port": 21006,
},
"baichuan-api": {
"port": 21007,
},
"azure-api": {
"port": 21008,
},
}
# fastchat multi model worker server
FSCHAT_MULTI_MODEL_WORKERS = {
# TODO:
}
# fastchat controller server
FSCHAT_CONTROLLER = {
"host": DEFAULT_BIND_HOST,
"port": 20001,
"dispatch_method": "shortest_queue",
}

135
configs/server_config.py.example Executable file
View File

@ -0,0 +1,135 @@
import sys
from configs.model_config import LLM_DEVICE
# Default timeout (in seconds) for httpx requests. If model loading or chat is slow and timeout errors occur, increase this value as appropriate.
HTTPX_DEFAULT_TIMEOUT = 300.0
# Whether the API enables cross-origin access; defaults to False, set it to True to enable.
# is open cross domain
OPEN_CROSS_DOMAIN = False
# Default bind host for every server. If changed to "0.0.0.0", the host of every XX_SERVER below needs to be modified.
# The expression below yields "127.0.0.1" on win32 and "0.0.0.0" on every other platform.
DEFAULT_BIND_HOST = "0.0.0.0" if sys.platform != "win32" else "127.0.0.1"
# webui.py server
WEBUI_SERVER = {
"host": DEFAULT_BIND_HOST,
"port": 8501,
}
# api.py server
API_SERVER = {
"host": DEFAULT_BIND_HOST,
"port": 7861,
}
# fastchat openai_api server
FSCHAT_OPENAI_API = {
"host": DEFAULT_BIND_HOST,
"port": 20000,
}
# fastchat model_worker server
# These models must be correctly configured in model_config.MODEL_PATH or ONLINE_MODEL.
# When launching startup.py, models can be specified with `--model-name xxxx yyyy`; if not specified, LLM_MODELS is used.
FSCHAT_MODEL_WORKERS = {
# Default configuration shared by all models; it can be overridden in a model-specific entry.
"default": {
"host": DEFAULT_BIND_HOST,
"port": 20002,
"device": LLM_DEVICE,
# False or 'vllm': the inference acceleration framework to use. If HuggingFace connectivity problems occur with vllm, see doc/FAQ.
# vllm support for some models is still immature, so it is disabled by default for now.
"infer_turbo": False,
# Parameters to configure for multi-GPU loading in model_worker
# "gpus": None, # GPUs to use, given as a str such as "0,1"; if that has no effect, specify them via CUDA_VISIBLE_DEVICES="0,1" or similar
# "num_gpus": 1, # number of GPUs to use
# "max_gpu_memory": "20GiB", # maximum GPU memory used on each GPU
# The following are less commonly used model_worker parameters; configure them as needed
# "load_8bit": False, # enable 8-bit quantization
# "cpu_offloading": None,
# "gptq_ckpt": None,
# "gptq_wbits": 16,
# "gptq_groupsize": -1,
# "gptq_act_order": False,
# "awq_ckpt": None,
# "awq_wbits": 16,
# "awq_groupsize": -1,
# "model_names": LLM_MODELS,
# "conv_template": None,
# "limit_worker_concurrency": 5,
# "stream_interval": 2,
# "no_register": False,
# "embed_in_truncate": False,
# The following are vllm_worker parameters. Note: vllm requires a GPU and has only been tested on Linux.
# tokenizer = model_path # add here if the tokenizer differs from model_path
# 'tokenizer_mode':'auto',
# 'trust_remote_code':True,
# 'download_dir':None,
# 'load_format':'auto',
# 'dtype':'auto',
# 'seed':0,
# 'worker_use_ray':False,
# 'pipeline_parallel_size':1,
# 'tensor_parallel_size':1,
# 'block_size':16,
# 'swap_space':4 , # GiB
# 'gpu_memory_utilization':0.90,
# 'max_num_batched_tokens':2560,
# 'max_num_seqs':256,
# 'disable_log_stats':False,
# 'conv_template':None,
# 'limit_worker_concurrency':5,
# 'no_register':False,
# 'num_gpus': 1
# 'engine_use_ray': False,
# 'disable_log_requests': False
},
# The default configuration can be overridden per model as in the example below
# "baichuan-7b": { # uses the IP and port from default
# "device": "cpu",
# },
# The entries below need not be modified; the models to launch are chosen in model_config
"zhipu-api": {
"port": 21001,
},
"minimax-api": {
"port": 21002,
},
"xinghuo-api": {
"port": 21003,
},
"qianfan-api": {
"port": 21004,
},
"fangzhou-api": {
"port": 21005,
},
"qwen-api": {
"port": 21006,
},
"baichuan-api": {
"port": 21007,
},
"azure-api": {
"port": 21008,
},
}
# fastchat multi model worker server
FSCHAT_MULTI_MODEL_WORKERS = {
# TODO:
}
# fastchat controller server
FSCHAT_CONTROLLER = {
"host": DEFAULT_BIND_HOST,
"port": 20001,
"dispatch_method": "shortest_queue",
}

View File

@ -0,0 +1,80 @@
## 指定制定列的csv文件加载器
from langchain.document_loaders import CSVLoader
import csv
from io import TextIOWrapper
from typing import Dict, List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.helpers import detect_file_encodings
class FilteredCSVLoader(CSVLoader):
    """CSV loader that turns one selected column of each row into a Document.

    Only the FIRST entry of ``columns_to_read`` is used as the page content;
    any further entries are currently ignored.
    """

    def __init__(
            self,
            file_path: str,
            columns_to_read: List[str],
            source_column: Optional[str] = None,
            metadata_columns: Optional[List[str]] = None,
            csv_args: Optional[Dict] = None,
            encoding: Optional[str] = None,
            autodetect_encoding: bool = False,
    ):
        """Configure the loader.

        Args:
            file_path: Path of the CSV file to read.
            columns_to_read: Column names to read; only the first is used.
            source_column: Column whose value becomes ``metadata["source"]``;
                when ``None`` the file path is used instead.
            metadata_columns: Extra columns copied into each document's
                metadata. ``None`` (the default) means no extra columns.
            csv_args: Keyword arguments forwarded to ``csv.DictReader``.
            encoding: Encoding used to open the file.
            autodetect_encoding: Retry with detected encodings when decoding
                with ``encoding`` fails.
        """
        super().__init__(
            file_path=file_path,
            source_column=source_column,
            # Build a fresh list per instance rather than sharing one mutable
            # default object between every loader.
            metadata_columns=[] if metadata_columns is None else metadata_columns,
            csv_args=csv_args,
            encoding=encoding,
            autodetect_encoding=autodetect_encoding,
        )
        self.columns_to_read = columns_to_read

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One Document per CSV row.

        Raises:
            RuntimeError: If the file cannot be read or decoded, including the
                case where encoding auto-detection is enabled but none of the
                detected encodings can decode the file.
        """
        docs = []
        try:
            with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
                docs = self.__read_file(csvfile)
        except UnicodeDecodeError as e:
            if not self.autodetect_encoding:
                raise RuntimeError(f"Error loading {self.file_path}") from e
            detected_encodings = detect_file_encodings(self.file_path)
            for encoding in detected_encodings:
                try:
                    with open(
                            self.file_path, newline="", encoding=encoding.encoding
                    ) as csvfile:
                        docs = self.__read_file(csvfile)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                # No candidate encoding worked: fail loudly instead of
                # silently returning an empty document list.
                raise RuntimeError(f"Error loading {self.file_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e
        return docs

    def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
        """Convert every row of an open CSV file into a Document.

        Raises:
            ValueError: If the first column of ``columns_to_read`` is missing
                from a row.
        """
        docs = []
        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
        for i, row in enumerate(csv_reader):
            if self.columns_to_read[0] not in row:
                raise ValueError(f"Column '{self.columns_to_read[0]}' not found in CSV file.")
            content = row[self.columns_to_read[0]]
            # Extract the source if available
            source = (
                row.get(self.source_column, None)
                if self.source_column is not None
                else self.file_path
            )
            metadata = {"source": source, "row": i}
            for col in self.metadata_columns:
                if col in row:
                    metadata[col] = row[col]
            docs.append(Document(page_content=content, metadata=metadata))
        return docs

2
document_loaders/__init__.py Executable file
View File

@ -0,0 +1,2 @@
from .mypdfloader import RapidOCRPDFLoader
from .myimgloader import RapidOCRLoader

25
document_loaders/myimgloader.py Executable file
View File

@ -0,0 +1,25 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
class RapidOCRLoader(UnstructuredFileLoader):
    """Loader that OCRs an image file and partitions the recognized text."""

    def _get_elements(self) -> List:
        """Run RapidOCR on ``self.file_path`` and partition the resulting text."""

        def img2text(filepath):
            # Imported lazily so the OCR dependency is only needed when used.
            from rapidocr_onnxruntime import RapidOCR

            engine = RapidOCR()
            recognized, _ = engine(filepath)
            if not recognized:
                # Nothing recognized: hand an empty string to the partitioner.
                return ""
            # The second item of every OCR entry is joined, one entry per line.
            return "\n".join([entry[1] for entry in recognized])

        extracted = img2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=extracted, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke test: OCR the bundled sample image and print the documents.
    loader = RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg")
    print(loader.load())

48
document_loaders/mypdfloader.py Executable file
View File

@ -0,0 +1,48 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
import tqdm
class RapidOCRPDFLoader(UnstructuredFileLoader):
    """Loader that extracts PDF page text plus OCR text of embedded images."""

    def _get_elements(self) -> List:
        """Extract text from ``self.file_path`` and partition it into elements."""

        def pdf2text(filepath):
            """Return each page's text followed by the OCR text of its images."""
            import fitz  # the fitz package shipped with pyMuPDF; not to be confused with `pip install fitz`
            from rapidocr_onnxruntime import RapidOCR
            import numpy as np

            ocr = RapidOCR()
            # Pieces are collected in a list and joined once at the end; the
            # resulting string is the same as repeated `+=` concatenation.
            pieces = []
            # Context managers make sure the document and the progress bar are
            # closed even when extraction or OCR raises.
            with fitz.open(filepath) as doc, tqdm.tqdm(
                    total=doc.page_count,
                    desc="RapidOCRPDFLoader context page index: 0",
            ) as b_unit:
                for i, page in enumerate(doc):
                    # Update the description
                    b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
                    # Show the progress-bar update immediately
                    b_unit.refresh()
                    # TODO: adjust the processing according to the order of text and images
                    text = page.get_text("")
                    pieces.append(text + "\n")

                    for img in page.get_images():
                        pix = fitz.Pixmap(doc, img[0])
                        # View the raw samples as a (height, width, channels) byte array.
                        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
                        result, _ = ocr(img_array)
                        if result:
                            ocr_result = [line[1] for line in result]
                            pieces.append("\n".join(ocr_result))

                    # Advance the progress bar by one page
                    b_unit.update(1)
            return "".join(pieces)

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke test: load the bundled sample PDF and print the documents.
    loader = RapidOCRPDFLoader(file_path="../tests/samples/ocr_test.pdf")
    print(loader.load())

121
init_database.py Executable file
View File

@ -0,0 +1,121 @@
import sys
sys.path.append(".")
from server.knowledge_base.migrate import (create_tables, reset_tables, import_from_db,
folder2db, prune_db_docs, prune_folder_files)
from configs.model_config import NLTK_DATA_PATH, EMBEDDING_MODEL
import nltk
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
from datetime import datetime
import sys
if __name__ == "__main__":
    # Command-line entry point: parse the flags, run the table operations that
    # were requested, then at most one vector-store / pruning operation, and
    # finally print the elapsed time.
    import argparse

    parser = argparse.ArgumentParser(description="please specify only one operate method once time.")

    parser.add_argument(
        "-r", "--recreate-vs",
        action="store_true",
        help=(
            "\nrecreate vector store.\n"
            "use this option if you have copied document files to the content folder, but vector store has not been populated or DEFAUL_VS_TYPE/EMBEDDING_MODEL changed.\n"
        ),
    )
    parser.add_argument(
        "--create-tables",
        action="store_true",
        help="create empty tables if not existed",
    )
    parser.add_argument(
        "--clear-tables",
        action="store_true",
        help="create empty tables, or drop the database tables before recreate vector stores",
    )
    parser.add_argument(
        "--import-db",
        help="import tables from specified sqlite database",
    )
    parser.add_argument(
        "-u", "--update-in-db",
        action="store_true",
        help=(
            "\nupdate vector store for files exist in database.\n"
            "use this option if you want to recreate vectors for files exist in db and skip files exist in local folder only.\n"
        ),
    )
    parser.add_argument(
        "-i", "--increament",
        action="store_true",
        help=(
            "\nupdate vector store for files exist in local folder and not exist in database.\n"
            "use this option if you want to create vectors increamentally.\n"
        ),
    )
    parser.add_argument(
        "--prune-db",
        action="store_true",
        help=(
            "\ndelete docs in database that not existed in local folder.\n"
            "it is used to delete database docs after user deleted some doc files in file browser\n"
        ),
    )
    parser.add_argument(
        "--prune-folder",
        action="store_true",
        help=(
            "\ndelete doc files in local folder that not existed in database.\n"
            "is is used to free local disk space by delete unused doc files.\n"
        ),
    )
    parser.add_argument(
        "-n", "--kb-name",
        type=str,
        nargs="+",
        default=[],
        help="specify knowledge base names to operate on. default is all folders exist in KB_ROOT_PATH.",
    )
    parser.add_argument(
        "-e", "--embed-model",
        type=str,
        default=EMBEDDING_MODEL,
        help="specify embeddings model.",
    )

    args = parser.parse_args()
    start_time = datetime.now()

    def _sync_folder(mode):
        # Shared call shape of the three folder2db-based operations.
        folder2db(kb_names=args.kb_name, mode=mode, embed_model=args.embed_model)

    # The two table flags are independent of each other and of the chain below.
    if args.create_tables:
        create_tables()  # confirm tables exist
    if args.clear_tables:
        reset_tables()
        print("database talbes reseted")

    # Only the first matching operation in this chain is executed.
    if args.recreate_vs:
        create_tables()
        print("recreating all vector stores")
        _sync_folder("recreate_vs")
    elif args.import_db:
        import_from_db(args.import_db)
    elif args.update_in_db:
        _sync_folder("update_in_db")
    elif args.increament:
        _sync_folder("increament")
    elif args.prune_db:
        prune_db_docs(args.kb_name)
    elif args.prune_folder:
        prune_folder_files(args.kb_name)

    end_time = datetime.now()
    print(f"总计用时: {end_time-start_time}")

BIN
knowledge_base/info.db Normal file

Binary file not shown.

View File

@ -0,0 +1,2 @@
你是谁?你的名字?你叫什么?你是什么?你是?
您好,我是里海科技研发的农业智能大模型,我的名字叫里海科技农业智能顾问。我可以和人类进行自然交流,解答农业相关问题。

174
knownledge_api.py Executable file
View File

@ -0,0 +1,174 @@
from typing import *
import nltk
import sys
import os
import pydantic
from pydantic import BaseModel
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from configs import VERSION
from configs.model_config import NLTK_DATA_PATH
from configs.server_config import OPEN_CROSS_DOMAIN
import argparse
import uvicorn
from fastapi.middleware.cors import CORSMiddleware
from starlette.responses import RedirectResponse
from fastapi import FastAPI
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
# Generic API response envelope: a status code, a status message and a payload.
# (Documented with comments rather than a docstring so the generated schema is unchanged.)
class BaseResponse(BaseModel):
    code: int = pydantic.Field(default=200, description="API status code")
    msg: str = pydantic.Field(default="success", description="API status message")
    data: Any = pydantic.Field(default=None, description="API data")

    class Config:
        # Example payload attached to the model's schema.
        schema_extra = {"example": {"code": 200, "msg": "success"}}
# Response envelope whose payload is a required list of names.
# (Documented with comments rather than a docstring so the generated schema is unchanged.)
class ListResponse(BaseResponse):
    data: List[str] = pydantic.Field(default=..., description="List of names")

    class Config:
        # Example payload attached to the model's schema.
        schema_extra = {
            "example": {"code": 200, "msg": "success", "data": ["doc1.docx", "doc2.pdf", "doc3.txt"]},
        }
async def document():
    # Redirect the caller to the "/docs" URL.
    docs_redirect = RedirectResponse(url="/docs")
    return docs_redirect
def create_app(run_mode: str = None):
    """Create the FastAPI application.

    ``run_mode`` is accepted but not used by this function. When
    ``OPEN_CROSS_DOMAIN`` (imported from ``configs.server_config``) is truthy,
    a CORS middleware allowing every origin, method and header is added.

    Returns:
        The configured FastAPI instance (no routes are mounted here).
    """
    api = FastAPI(title="Langchain-Chatchat API Server", version=VERSION)

    if not OPEN_CROSS_DOMAIN:
        # Cross-origin access stays disabled.
        return api

    # Cross-origin access was requested: allow all origins.
    cors_options = {
        "allow_origins": ["*"],
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    api.add_middleware(CORSMiddleware, **cors_options)
    return api
def mount_knowledge_routes(app: FastAPI):
    """Register the knowledge-base management endpoints on ``app``.

    Every route is tagged "Knowledge Base Management". Handlers are imported
    inside the function, and routes are registered in the order listed below.
    """
    from server.knowledge_base.kb_api import list_kbs, create_kb, delete_kb
    from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs,
                                                  update_docs, download_doc, recreate_vector_store,
                                                  search_docs, DocumentWithScore, update_info)

    # Tag: Knowledge Base Management
    # Each row: (registrar, URL path, handler, extra route options).
    route_table = (
        (app.get, "/knowledge_base/list_knowledge_bases", list_kbs,
         {"response_model": ListResponse, "summary": "获取知识库列表"}),
        (app.post, "/knowledge_base/create_knowledge_base", create_kb,
         {"response_model": BaseResponse, "summary": "创建知识库"}),
        (app.post, "/knowledge_base/delete_knowledge_base", delete_kb,
         {"response_model": BaseResponse, "summary": "删除知识库"}),
        (app.get, "/knowledge_base/list_files", list_files,
         {"response_model": ListResponse, "summary": "获取知识库内的文件列表"}),
        (app.post, "/knowledge_base/search_docs", search_docs,
         {"response_model": List[DocumentWithScore], "summary": "搜索知识库"}),
        (app.post, "/knowledge_base/upload_docs", upload_docs,
         {"response_model": BaseResponse, "summary": "上传文件到知识库,并/或进行向量化"}),
        (app.post, "/knowledge_base/delete_docs", delete_docs,
         {"response_model": BaseResponse, "summary": "删除知识库内指定文件"}),
        (app.post, "/knowledge_base/update_info", update_info,
         {"response_model": BaseResponse, "summary": "更新知识库介绍"}),
        (app.post, "/knowledge_base/update_docs", update_docs,
         {"response_model": BaseResponse, "summary": "更新现有文件到知识库"}),
        # The last two routes declare no response_model.
        (app.get, "/knowledge_base/download_doc", download_doc,
         {"summary": "下载对应的知识文件"}),
        (app.post, "/knowledge_base/recreate_vector_store", recreate_vector_store,
         {"summary": "根据content中文档重建向量库流式输出处理进度。"}),
    )

    for register, path, handler, options in route_table:
        register(path, tags=["Knowledge Base Management"], **options)(handler)
def run_api(host, port, **kwargs):
    """Serve the module-level ``app`` with uvicorn on ``host``:``port``.

    TLS options are passed to uvicorn only when both ``ssl_keyfile`` and
    ``ssl_certfile`` are supplied (and truthy) in ``kwargs``.

    NOTE(review): ``app`` is read from module scope; in this file it is only
    assigned inside the ``__main__`` block — confirm before calling this
    function from another module.
    """
    keyfile = kwargs.get("ssl_keyfile")
    certfile = kwargs.get("ssl_certfile")

    if not (keyfile and certfile):
        # Plain HTTP: at least one of the two TLS files is missing.
        uvicorn.run(app, host=host, port=port)
        return

    uvicorn.run(
        app,
        host=host,
        port=port,
        ssl_keyfile=kwargs.get("ssl_keyfile"),
        ssl_certfile=kwargs.get("ssl_certfile"),
    )
if __name__ == "__main__":
    # Script entry point: parse the CLI options, build the app, mount the
    # knowledge-base routes and start serving.
    parser = argparse.ArgumentParser(
        prog='langchain-ChatGLM',
        description='About langchain-ChatGLM, local knowledge based ChatGLM with langchain'
                    ' 基于本地知识库的 ChatGLM 问答',
    )
    parser.add_argument("--host", default="0.0.0.0", type=str)
    parser.add_argument("--port", default=7861, type=int)
    # Optional TLS material; both must be given for HTTPS to be used.
    for tls_flag in ("--ssl_keyfile", "--ssl_certfile"):
        parser.add_argument(tls_flag, type=str)

    # Initialize from the parsed arguments
    args = parser.parse_args()
    args_dict = vars(args)

    # `app` must stay a module-level name: run_api reads it from module scope.
    app = create_app()
    mount_knowledge_routes(app)

    run_api(
        args.host,
        args.port,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
    )

View File

@ -0,0 +1,76 @@
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]
ftp://ftp.cs.cmu.edu/project/speech/dict/
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a
Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.
File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L
The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.
Phonemes: There are 39 phonemes, as shown below:
Phoneme Example Translation Phoneme Example Translation
------- ------- ----------- ------- ------- -----------
AA odd AA D AE at AE T
AH hut HH AH T AO ought AO T
AW cow K AW AY hide HH AY D
B be B IY CH cheese CH IY Z
D dee D IY DH thee DH IY
EH Ed EH D ER hurt HH ER T
EY ate EY T F fee F IY
G green G R IY N HH he HH IY
IH it IH T IY eat IY T
JH gee JH IY K key K IY
L lee L IY M me M IY
N knee N IY NG ping P IH NG
OW oat OW T OY toy T OY
P pee P IY R read R IY D
S sea S IY SH she SH IY
T tea T IY TH theta TH EY T AH
UH hood HH UH D UW two T UW
V vee V IY W we W IY
Y yield Y IY L D Z zee Z IY
ZH seizure S IY ZH ER
(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
The contents of this file are deemed to be source code.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
This work was supported in part by funding from the Defense Advanced
Research Projects Agency, the Office of Naval Research and the National
Science Foundation of the United States of America, and by member
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
the contributions of many volunteers to the expansion and improvement of
this dictionary.
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

133737
nltk_data/corpora/cmudict/cmudict Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.
For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
There are pretrained tokenizers for the following languages:
File Language Source Contents Size of training corpus(in tokens) Model contributed by
=======================================================================================================================================================================
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
(Berlingske Avisdata, Copenhagen) Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
(American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
Text Bank (Suomen Kielen newspapers
Tekstipankki)
Finnish Center for IT Science
(CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
(Switzerland) CD-ROM
(Uses "ss"
instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
(Bokmål and Information Technologies,
Nynorsk) Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
(http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
(Brazilian) (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
Slovene Academy for Arts
and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
(and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
(Türkçe Derlem Projesi)
University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.
---- Training Code ----
# import punkt
import nltk.tokenize.punkt
# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
# Train tokenizer
tokenizer.train(text)
# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()
---------

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.
For information about how to use these models, please refer to the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
There are pretrained tokenizers for the following languages:
File Language Source Contents Size of training corpus(in tokens) Model contributed by
=======================================================================================================================================================================
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
(Berlingske Avisdata, Copenhagen) Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
(American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
Text Bank (Suomen Kielen newspapers
Tekstipankki)
Finnish Center for IT Science
(CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
(Switzerland) CD-ROM
(Uses "ss"
instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
(Bokmål and Information Technologies,
Nynorsk) Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
(http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
(Brazilian) (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
Slovene Academy for Arts
and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
(and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
(Türkçe Derlem Projesi)
University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.
---- Training Code ----
# Import the Punkt module
import nltk.tokenize.punkt
# Make a new tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
# Read in the training corpus (one example: Slovene).
# Plain "r" mode is used because the "U" mode flag was removed in Python 3.11;
# the with-block closes the corpus file even if reading fails.
import codecs
with codecs.open("slovene.plain", "r", "iso-8859-2") as corpus:
    text = corpus.read()
# Train the tokenizer
tokenizer.train(text)
# Dump the pickled tokenizer; the with-block closes the file even if
# pickling raises
import pickle
with open("slovene.pickle", "wb") as out:
    pickle.dump(tokenizer, out)
---------

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

72
requirements.txt Executable file
View File

@ -0,0 +1,72 @@
# API requirements
langchain>=0.0.329 # using the latest LangChain release is recommended
langchain-experimental>=0.0.30
fschat[model_worker]==0.2.32
xformers>=0.0.22.post4
openai>=0.28.1
sentence_transformers
transformers>=4.34
torch>=2.0.1 # version 2.1 is recommended
torchvision
torchaudio
fastapi>=0.104
nltk>=3.8.1
uvicorn~=0.23.1
starlette~=0.27.0
pydantic~=1.10.11
unstructured[all-docs]>=0.10.12
python-magic-bin; sys_platform == 'win32'
SQLAlchemy==2.0.19
faiss-cpu
accelerate
spacy
PyMuPDF
rapidocr_onnxruntime
requests~=2.31.0
pathlib~=1.0.1
pytest~=7.4.3
numexpr~=2.8.7
strsimpy~=0.2.1
markdownify~=0.11.6
tiktoken
tqdm~=4.66.1
websockets~=11.0.3
numpy~=1.24.4
pandas~=2.0.3
einops
transformers_stream_generator==0.0.4
vllm>=0.2.0; sys_platform == "linux"
# online api libs
# zhipuai
# dashscope>=1.10.0 # qwen
# qianfan
# volcengine>=1.0.106 # fangzhou
# uncomment libs if you want to use corresponding vector store
# pymilvus==2.1.3 # requires milvus==2.1.3
# psycopg2
# pgvector
# WebUI requirements
streamlit~=1.27.0
streamlit-option-menu>=0.3.6
streamlit-antd-components>=0.1.11
streamlit-chatbox>=1.1.11
streamlit-aggrid>=0.3.4.post3
httpx[brotli,http2,socks]>=0.25.0
watchdog
sentencepiece~=0.1.99
cachetools~=5.3.2
chardet~=5.2.0
python-dateutil~=2.8.2
safetensors~=0.4.0
readline~=8.2
colorama~=0.4.6
loguru~=0.7.2
pyyaml~=6.0.1

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

0
server/db/__init__.py Executable file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

16
server/db/base.py Executable file
View File

@ -0,0 +1,16 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
from sqlalchemy.orm import sessionmaker
from configs import SQLALCHEMY_DATABASE_URI
import json
def _dump_json_unicode(obj):
    """Serialize ``obj`` to JSON text, leaving non-ASCII characters unescaped."""
    return json.dumps(obj, ensure_ascii=False)


# Engine for the database at SQLALCHEMY_DATABASE_URI; JSON values are
# serialized through the helper above.
engine = create_engine(SQLALCHEMY_DATABASE_URI, json_serializer=_dump_json_unicode)

# Session factory bound to the engine, with autocommit and autoflush disabled.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Declarative base class produced by declarative_base().
Base: DeclarativeMeta = declarative_base()

0
server/db/models/__init__.py Executable file
View File

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More