Rework rag, remove model scripts, update README
@@ -4,6 +4,7 @@ from sentence_transformers import SentenceTransformer
 from qdrant_client import QdrantClient
 from qdrant_client.http import models
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.text_splitter import MarkdownHeaderTextSplitter
 
 DEFAULT_INPUT_DIR="data"
 DEFAULT_CHUNK_SIZE=500
@@ -59,24 +60,45 @@ def load_markdown_files(input_dir):
     return documents
 
 def chunk_text(texts, chunk_size, chunk_overlap):
-    splitter = RecursiveCharacterTextSplitter(
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=[
+            ("#", "Header 1"),
+            ("##", "Header 2"),
+            ("###", "Header 3"),
+        ],
+        strip_headers=False,
+        return_each_line=False,
+    )
+    text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
+        add_start_index=True,
         length_function=len,
         separators=["\n\n", "\n", " ", ""]
     )
 
     chunks = []
     for doc in texts:
-        doc_chunks = splitter.split_text(doc["text"])
-        for i, chunk in enumerate(doc_chunks):
-            chunk_id = f"{doc['id']}_chunk{i}"
-            chunk_dict = {"id": chunk_id, "text": chunk}
-            # Carry over all available metadata
-            for key in ["url", "version", "author", "date"]:
-                if key in doc and doc[key] is not None:
-                    chunk_dict[key] = doc[key]
-            chunks.append(chunk_dict)
+        md_header_splits = markdown_splitter.split_text(doc["text"])
+
+        for md_split in md_header_splits:
+            # RecursiveCharacterTextSplitter for each markdown split
+            split_docs = text_splitter.split_documents([md_split])
+
+            for i, chunk in enumerate(split_docs):
+                chunk_id = f"{doc['id']}_chunk{i}"
+                chunk_dict = {"id": chunk_id, "text": chunk.page_content}
+
+                # Carry over all available metadata, including metadata from MarkdownHeaderTextSplitter
+                for key in ["url", "version", "author", "date"]:
+                    if key in doc and doc[key] is not None:
+                        chunk_dict[key] = doc[key]
+
+                # Add metadata from MarkdownHeaderTextSplitter
+                for key, value in chunk.metadata.items():
+                    chunk_dict[key] = value
+
+                chunks.append(chunk_dict)
     return chunks
 
 def embed_and_upload(chunks, embedding_model_name, qdrant_host="localhost", qdrant_port=6333, qdrant_collection="rag"):
@@ -149,3 +171,4 @@ if __name__ == "__main__":
         args.qdrant_port,
         args.qdrant_collection
     )
+
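For reference, a minimal sketch (not part of the commit) of how the new two-stage chunking composes: MarkdownHeaderTextSplitter first splits a document by headers, then RecursiveCharacterTextSplitter bounds each header section by size. The sample text, chunk sizes, and header set below are illustrative assumptions, not taken from the repository.

    from langchain.text_splitter import (
        MarkdownHeaderTextSplitter,
        RecursiveCharacterTextSplitter,
    )

    # Hypothetical sample document; real input comes from load_markdown_files().
    sample_md = "# Install\nRun the installer.\n\n## Docker\nUse the compose file.\n"

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")],
        strip_headers=False,
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100, chunk_overlap=20, add_start_index=True
    )

    # First pass: header-aware split; each piece keeps its header path in .metadata.
    for section in markdown_splitter.split_text(sample_md):
        # Second pass: size-bounded split inside each header section.
        for chunk in text_splitter.split_documents([section]):
            print(chunk.metadata, repr(chunk.page_content[:40]))

Each resulting chunk carries the header path in chunk.metadata ("Header 1", "Header 2"), which is what the updated chunk_text copies into chunk_dict alongside the document-level url, version, author, and date fields.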