From f5d3229b7cd0a735416c3dedf9336ee0d8b095da Mon Sep 17 00:00:00 2001
From: Anthony Axenov
Date: Fri, 23 Jan 2026 14:05:19 +0800
Subject: [PATCH] WIP

---
 compose.yml   |  42 +++++++-------
 rag/README.md |   4 +-
 rag/rag.py    | 156 ++++++++++++++++++++++----------------------
 3 files changed, 92 insertions(+), 110 deletions(-)

diff --git a/compose.yml b/compose.yml
index 352e0a3..648e5a8 100644
--- a/compose.yml
+++ b/compose.yml
@@ -9,25 +9,25 @@ services:
       - "${OLLAMA_PORT:-11434}:11434"
     restart: "no"
 
-  ai-qdrant:
-    container_name: ai-qdrant
-    image: qdrant/qdrant
-    env_file: .env
-    ports:
-      - "${QDRANT_PORT:-6333}:6333"
-    volumes:
-      - ./.data/qdrant/storage:/qdrant/storage
-    restart: "no"
-    profiles: ["rag"]
+  # ai-qdrant:
+  #   container_name: ai-qdrant
+  #   image: qdrant/qdrant
+  #   env_file: .env
+  #   ports:
+  #     - "${QDRANT_PORT:-6333}:6333"
+  #   volumes:
+  #     - ./.data/qdrant/storage:/qdrant/storage
+  #   restart: "no"
+  #   profiles: ["rag"]
 
-  ai-webui:
-    container_name: ai-webui
-    image: ghcr.io/open-webui/open-webui:main
-    env_file: .env
-    volumes:
-      - ./.data/webui:/app/backend/data
-    ports:
-      - "${OWEBUI_PORT:-9999}:8080"
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    restart: "no"
+  # ai-webui:
+  #   container_name: ai-webui
+  #   image: ghcr.io/open-webui/open-webui:main
+  #   env_file: .env
+  #   volumes:
+  #     - ./.data/webui:/app/backend/data
+  #   ports:
+  #     - "${OWEBUI_PORT:-9999}:8080"
+  #   extra_hosts:
+  #     - "host.docker.internal:host-gateway"
+  #   restart: "no"
diff --git a/rag/README.md b/rag/README.md
index 3b48629..a539f46 100644
--- a/rag/README.md
+++ b/rag/README.md
@@ -8,7 +8,7 @@
 cd ..; ./up; cd -
 python3 -m venv .venv
 source .venv/bin/activate
-pip install beautifulsoup4 markdownify sentence-transformers qdrant-client langchain transformers
+pip install beautifulsoup4 markdownify sentence-transformers qdrant-client langchain transformers ollama
 pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
 ./download.sh 123456789 # <<== pageId страницы в Confluence
 python3 convert.py
@@ -66,7 +66,7 @@ rag/
 ```bash
 python3 -m venv .venv
 source ./venv/bin/activate
-pip install beautifulsoup4 markdownify sentence-transformers qdrant-client langchain transformers
+pip install beautifulsoup4 markdownify sentence-transformers qdrant-client langchain transformers ollama
 pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
 ```
diff --git a/rag/rag.py b/rag/rag.py
index b8510eb..e428536 100644
--- a/rag/rag.py
+++ b/rag/rag.py
@@ -1,10 +1,9 @@
 import os
-import requests
-import json
 import time
 import sys
 from qdrant_client import QdrantClient
 from sentence_transformers import SentenceTransformer, CrossEncoder
+import ollama
 
 DEFAULT_CHAT_MODEL = "openchat:7b"
 DEFAULT_EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
@@ -38,33 +37,26 @@ class RagSystem:
         self.qdrant_port = qdrant_port
         self.chat_model = chat_model
         self.emb_model = SentenceTransformer(embed_model)
-        self.qdrant = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)
+        self.qdrant = QdrantClient(host=qdrant_host, port=qdrant_port)
         self.use_rank = use_rank
         if self.use_rank:
             self.rank_model = CrossEncoder(rank_model)
         self.conversation_history = []
+        self.ollama = ollama.Client(host=ollama_url)
 
     def check_chat_model(self):
-        response = requests.get(f"{self.ollama_url}/api/tags")
-        if response.status_code != 200:
-            return False
-        for model in response.json().get("models", []):
-            if model["name"] == self.chat_model:
-                return True
-        return False
+        models = self.ollama.list().models
+        return any(model.model == self.chat_model for model in models)
 
     def install_chat_model(self, model: str = DEFAULT_CHAT_MODEL):
         try:
-            response = requests.post(f"{self.ollama_url}/api/pull", json={"model": model})
-            if response.status_code == 200:
-                print(f"Модель {self.chat_model} установлена успешно")
-            else:
-                print(f"Ошибка установки модели: {response.text}")
+            self.ollama.pull(model)
+            print(f"Модель {model} установлена успешно")
         except Exception as e:
-            print(f"Ошибка проверки модели: {str(e)}")
+            print(f"Ошибка установки модели: {str(e)}")
 
     def load_chat_model(self):
-        requests.post(f"{self.ollama_url}/api/generate", json={"model": self.chat_model}, timeout=600)
+        self.ollama.generate(model=self.chat_model, keep_alive=-1)
 
     def search_qdrant(self, query: str, doc_count: int = DEFAULT_TOP_K, collection_name = DEFAULT_QDRANT_COLLECTION):
         query_vec = self.emb_model.encode(query, show_progress_bar=False).tolist()
@@ -100,85 +92,71 @@ class RagSystem:
         return ranked_docs[:top_n]
 
     def generate_answer(self, sys_prompt: str, user_prompt: str):
-        url = f"{self.ollama_url}/api/generate"
-        body = {
-            "model": self.chat_model,
-            "system": sys_prompt,
-            "prompt": user_prompt,
-            "stream": False,
-            "options": {
-                "temperature": 0.5,
-                # "top_p": 0.2,
-            },
-        }
-
-        response = requests.post(url, json=body, timeout=900)
-        if response.status_code != 200:
-            return f"Ошибка генерации ответа: {response.status_code} {response.text}"
-        self.response = response.json()
-        return self.response["response"]
+        try:
+            response = self.ollama.generate(
+                model=self.chat_model,
+                system=sys_prompt,
+                prompt=user_prompt,
+                options={
+                    "temperature": 0.5,
+                },
+                stream=False,
+            )
+            self.last_response = response
+            return response.response
+        except Exception as e:
+            print(f"Ошибка генерации ответа: {str(e)}")
+            return str(e)
 
     def generate_answer_stream(self, sys_prompt: str, user_prompt: str):
-        url = f"{self.ollama_url}/api/generate"
-        body = {
-            "model": self.chat_model,
-            "system": sys_prompt,
-            "prompt": user_prompt,
-            "stream": True,
-            "options": {
-                "temperature": 0.5,
-                # "top_p": 0.2,
-            },
-        }
-        resp = requests.post(url, json=body, stream=True, timeout=900)
-        if resp.status_code != 200:
-            raise RuntimeError(f"Ошибка генерации ответа: {resp.status_code} {resp.text}")
-
-        answer = ""
-        self.response = None
-        for chunk in resp.iter_lines():
-            if chunk:
-                try:
-                    decoded_chunk = chunk.decode('utf-8')
-                    data = json.loads(decoded_chunk)
-                    if "response" in data:
-                        yield data["response"]
-                        answer += data["response"]
-                    if "done" in data and data["done"] is True:
-                        self.response = data
-                        break
-                    elif "error" in data:
-                        answer += f" | Ошибка стриминга ответа: {data['error']}"
-                        break
-                except json.JSONDecodeError as e:
-                    answer += f" | Ошибка конвертации чанка: {chunk.decode('utf-8')} - {e}"
-                except Exception as e:
-                    answer += f" | Ошибка обработки чанка: {e}"
+        try:
+            generator = self.ollama.generate(
+                model=self.chat_model,
+                system=sys_prompt,
+                prompt=user_prompt,
+                options={
+                    "temperature": 0.5,
+                },
+                stream=True,
+            )
+            for response in generator:
+                if response.response:
+                    yield response.response
+                if response.done:
+                    self.last_response = response
+                    break
+        except Exception as e:
+            print(f"Ошибка стриминга: {str(e)}")
 
     def get_prompt_eval_count(self):
-        if not self.response:
+        if not hasattr(self, "last_response"):
             return 0
-        return self.response["prompt_eval_count"]
+        return self.last_response.prompt_eval_count or 0
 
     def get_prompt_eval_duration(self):
-        if not self.response:
+        if not hasattr(self, "last_response"):
             return 0
-        return self.response["prompt_eval_duration"] / (10 ** 9)
+        return (self.last_response.prompt_eval_duration or 0) / (10 ** 9)
 
     def get_eval_count(self):
-        if not self.response:
+        if not hasattr(self, "last_response"):
             return 0
-        return self.response["eval_count"]
+        return self.last_response.eval_count or 0
 
     def get_eval_duration(self):
-        if not self.response:
+        if not hasattr(self, "last_response"):
             return 0
-        return self.response["eval_duration"] / (10 ** 9)
+        return (self.last_response.eval_duration or 0) / (10 ** 9)
 
     def get_total_duration(self):
-        if not self.response:
+        if not hasattr(self, "last_response"):
             return 0
-        return self.response["total_duration"] / (10 ** 9)
+        return (self.last_response.total_duration or 0) / (10 ** 9)
 
     def get_tps(self):
         eval_count = self.get_eval_count()
@@ -360,19 +338,23 @@ Context:
 
     def process_query(self, sys_prompt: str, user_prompt: str, streaming: bool = DEFAULT_STREAM):
         answer = ""
-        # try:
         if streaming:
             self.print_v(text="\nГенерация потокового ответа (^C для остановки)...\n")
             print(f"<<< ", end='', flush=True)
-            for token in self.rag.generate_answer_stream(sys_prompt, user_prompt):
-                answer += token
-                print(token, end='', flush=True)
+            try:
+                for token in self.rag.generate_answer_stream(sys_prompt, user_prompt):
+                    answer += token
+                    print(token, end='', flush=True)
+            except KeyboardInterrupt:
+                print("\n*** Генерация ответа прервана")
+                return answer
         else:
             self.print_v(text="\nГенерация ответа (^C для остановки)...\n")
-            answer = self.rag.generate_answer(sys_prompt, user_prompt)
-            print(f"<<< {answer}\n")
-        # except RuntimeError as e:
-        #     answer = str(e)
+            try:
+                answer = self.rag.generate_answer(sys_prompt, user_prompt)
+                print(f"<<< {answer}\n")
+            except KeyboardInterrupt:
+                print("\n*** Генерация ответа прервана")
+                return ""
         print(f"\n===================================================")
         return answer
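
Note: the port above assumes the `ollama` Python client (ollama-python 0.4+), where `Client.list()` returns a typed response whose `models` entries expose the name as `.model`, and `Client.generate()` returns or streams `GenerateResponse` objects carrying `response`, `done`, and the eval/duration counters read by the `get_*()` helpers. A minimal standalone sketch of those calls, with the host URL and model name taken from the defaults in this repo as illustrative placeholders:

```python
# Minimal sketch of the ollama-python (>= 0.4) calls assumed by the patch.
# Host URL and model name are placeholders based on the repo defaults.
import ollama

client = ollama.Client(host="http://localhost:11434")
model = "openchat:7b"

# Availability check: ListResponse.models is a list of entries with a `.model` name.
if not any(m.model == model for m in client.list().models):
    client.pull(model)  # blocking pull, returns a ProgressResponse

# Non-streaming generation: a single GenerateResponse with the answer and stats.
resp = client.generate(
    model=model,
    system="Answer briefly.",
    prompt="Ping?",
    options={"temperature": 0.5},
    stream=False,
)
print(resp.response, resp.eval_count, resp.total_duration)

# Streaming generation: an iterator of GenerateResponse chunks; the chunk with
# done=True carries prompt_eval_count / eval_duration / total_duration.
last = None
for chunk in client.generate(model=model, prompt="Ping?", stream=True):
    print(chunk.response, end="", flush=True)
    if chunk.done:
        last = chunk
```

If an older ollama-python (< 0.4) is pinned, `list()` returns plain dicts and the name field is `name` instead of `model`, so the `check_chat_model()` comparison would need adjusting.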