1
0

Доработка rag, удаление скриптов моделей, актуализация README

This commit is contained in:
2025-08-31 00:51:42 +08:00
parent c408972b45
commit defc30cad0
108 changed files with 635 additions and 745 deletions

View File

@@ -4,6 +4,7 @@ from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
# Fallback input location; presumably the directory scanned for markdown
# files when none is given explicitly -- confirm against the CLI parser.
DEFAULT_INPUT_DIR = "data"
# Fallback chunk size; presumably forwarded to chunk_text(), where length is
# measured with len() -- confirm against the CLI parser.
DEFAULT_CHUNK_SIZE = 500
@@ -59,24 +60,45 @@ def load_markdown_files(input_dir):
return documents
def chunk_text(texts, chunk_size, chunk_overlap):
    """Split markdown documents into size-bounded chunks with metadata.

    Every document is first split on markdown headers (levels 1-3, headers
    kept in the text); each resulting section is then split further by a
    ``RecursiveCharacterTextSplitter`` so that no chunk exceeds
    ``chunk_size``.

    Args:
        texts: Iterable of dicts with at least ``"id"`` and ``"text"`` keys.
            The optional keys ``"url"``, ``"version"``, ``"author"`` and
            ``"date"`` are copied onto every chunk of the document when
            present and not ``None``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            character splitter.

    Returns:
        A list of dicts, one per chunk, each holding ``"id"``
        (``"<doc id>_chunk<n>"``), ``"text"``, the copied document metadata
        and whatever metadata the splitters attached to the chunk (header
        titles under the names configured below, plus the start offset
        requested via ``add_start_index``).
    """
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ],
        strip_headers=False,
        return_each_line=False,
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

    inherited_keys = ("url", "version", "author", "date")
    chunks = []
    for doc in texts:
        # Document-level metadata is identical for every chunk of the
        # document, so collect it once instead of once per chunk.
        doc_metadata = {
            key: doc[key]
            for key in inherited_keys
            if key in doc and doc[key] is not None
        }

        # The counter runs across ALL header sections of the document.
        # Restarting it per section would give the first chunk of every
        # section the same "<doc id>_chunk0" identifier.
        chunk_index = 0
        for md_split in markdown_splitter.split_text(doc["text"]):
            # Size-bound each header section with the character splitter.
            for chunk in text_splitter.split_documents([md_split]):
                chunk_dict = {
                    "id": f"{doc['id']}_chunk{chunk_index}",
                    "text": chunk.page_content,
                }
                chunk_dict.update(doc_metadata)
                # Splitter metadata is applied last, so it wins on a key clash
                # (same precedence as the original loop order).
                chunk_dict.update(chunk.metadata)
                chunks.append(chunk_dict)
                chunk_index += 1
    return chunks
def embed_and_upload(chunks, embedding_model_name, qdrant_host="localhost", qdrant_port=6333, qdrant_collection="rag"):
@@ -149,3 +171,4 @@ if __name__ == "__main__":
args.qdrant_port,
args.qdrant_collection
)