$CONTENT" > "$HTML_FILEPATH"
echo "Сохранено: $OUTPUT_PATH/$TITLE.html"
CHILD_IDS=$(echo "$RESPONSE" | jq -r '.children.page.results[]?.id' 2>/dev/null)
diff --git a/rag/rag.py b/rag/rag.py
index addf8a7..d7c7898 100644
--- a/rag/rag.py
+++ b/rag/rag.py
@@ -1,31 +1,34 @@
-import argparse
import os
-import hashlib
import requests
+import json
+import time
from sentence_transformers import SentenceTransformer
-class LocalRAGSystem:
+class RagSystem:
def __init__(self,
md_folder: str = "input_md",
ollama_url: str = "http://localhost:11434",
qdrant_host: str = "localhost",
qdrant_port: int = 6333,
embed_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
- chat_model: str = "qwen2.5:3b"):
+ chat_model: str = "phi4-mini:3.8b"):
self.md_folder = md_folder
self.ollama_url = ollama_url
self.qdrant_host = qdrant_host
self.qdrant_port = qdrant_port
- self.embed_model = embed_model
self.chat_model = chat_model
self.emb_model = SentenceTransformer(embed_model)
self.prompt = ""
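+        # Accumulates {role, content} turns so dialogs can be saved via the "save" command.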
+ self.conversation_history = []
+ self.load_chat_model()
- def get_embedding(self, text: str):
- return self.emb_model.encode(text, show_progress_bar=False).tolist()
+ def load_chat_model(self):
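+        # Posting only the model name (no prompt) asks Ollama to preload the model into memory.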
+ url = f"{self.ollama_url}/api/generate"
+ body = {"model": self.chat_model}
+ requests.post(url, json=body, timeout=600)
def search_qdrant(self, query: str, top_k: int = 6):
- query_vec = self.get_embedding(query)
+ query_vec = self.emb_model.encode(query, show_progress_bar=False).tolist()
url = f"http://{self.qdrant_host}:{self.qdrant_port}/collections/rag_collection/points/search"
payload = {
"vector": query_vec,
@@ -38,69 +41,156 @@ class LocalRAGSystem:
results = resp.json().get("result", [])
return results
- def generate_answer(self, prompt: str):
- url = f"{self.ollama_url}/api/generate"
- body = {
- "model": self.chat_model,
- "prompt": prompt,
- "stream": False
- }
- resp = requests.post(url, json=body, timeout=600)
- if resp.status_code != 200:
- return f"Ошибка генерации ответа: {resp.status_code} {resp.text}"
- return resp.json().get("response", "").strip()
-
def prepare_sources(self, context_docs: list):
sources = ""
for idx, doc in enumerate(context_docs, start=1):
text = doc['payload'].get("text", "").strip()
- sources = f"{sources}\n--- Source [{idx}] ---\n{text}\n"
- return sources.strip()
+ sources = f"{sources}\n\n{text}\n\n"
+ return sources
def prepare_prompt(self, query: str, context_docs: list):
sources = self.prepare_sources(context_docs)
if os.path.exists('sys_prompt.txt'):
with open('sys_prompt.txt', 'r') as fp:
- return fp.read().replace("{{sources}}", sources).replace("{{query}}", query)
+ prompt_template = fp.read()
+ return prompt_template.replace("{{sources}}", sources).replace("{{query}}", query)
else:
- return f"""
- Please provide an answer based solely on the provided sources.
- It is prohibited to generate an answer based on your pretrained data.
- If uncertain, ask the user for clarification.
- Respond in the same language as the user's query.
- If there are no sources in context, clearly state that.
- If the context is unreadable or of poor quality, inform the user and provide the best possible answer.
- When referencing information from a source, cite the appropriate source(s) using their corresponding numbers.
- Every answer should include at least one source citation.
- Only cite a source when you are explicitly referencing it.
+ return f"""### Your role
+You are a helpful assistant that can answer questions based on the provided sources.
- If none of the sources are helpful, you should indicate that.
- For example:
+### Your user
+User is a human who is asking a question related to the provided sources.
- --- Source 1 ---
- The sky is red in the evening and blue in the morning.
+### Your task
+Please provide an answer based solely on the provided sources and the conversation history.
- --- Source 2 ---
- Water is wet when the sky is red.
+### Rules
+- You **MUST** respond in the SAME language as the user's query.
+- If uncertain, you **MUST** ask the user for clarification.
+- If there are no sources in context, you **MUST** clearly state that.
+- If none of the sources are helpful, you **MUST** clearly state that.
+- If you are unsure about the answer, you **MUST** clearly state that.
+- If the context is unreadable or of poor quality, you **MUST** inform the user and provide the best possible answer.
+- When referencing information from a source, you **MUST** cite the appropriate source(s) using their corresponding numbers.
+- **Only include inline citations using [id] (e.g., [1], [2]) when the tag includes an id attribute.**
+- You **MUST NOT** add any XML/HTML tags to your response.
+- You **MUST NOT** cite a source whose tag does not contain an id attribute.
+- Every answer SHOULD include at least one source citation.
+- Only cite a source when you are explicitly referencing it.
+- You may also cite multiple sources if they are all relevant to the question.
+- Ensure citations are concise and directly related to the information provided.
+- You CAN format your responses using Markdown.
- Query: When is water wet?
- Answer: Water will be wet when the sky is red [2], which occurs in the evening [1].
+### Example
- Now it's your turn. Below are several numbered sources of information:
- {context}
+Sources and query:
+```
+[1] The sky is red in the evening and blue in the morning.
+[2] Water is wet when the sky is red.
+Query: When is water wet?
+```
+Response:
+```
+Water will be wet when the sky is red [2], which occurs in the evening [1].
+```
- User query: {query}
- Your answer:
- """
+### Now let's start!
+
+```
+{sources}
+Query: {query}
+```
+
+Respond."""
+
+ def generate_answer(self, prompt: str):
+ url = f"{self.ollama_url}/api/generate"
+ body = {
+ "model": self.chat_model,
+ "prompt": prompt,
+ "messages": self.conversation_history,
+ "stream": False,
+ # "options": {
+ # "temperature": 0.4,
+ # "top_p": 0.1,
+ # },
+ }
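+        # Keep the raw response so the get_*() stat helpers below can read the eval counters.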
+ self.response = requests.post(url, json=body, timeout=600)
+ if self.response.status_code != 200:
+ return f"Ошибка генерации ответа: {self.response.status_code} {self.response.text}"
+ return self.response.json().get("response", "").strip()
+
+ def generate_answer_stream(self, prompt: str):
+ url = f"{self.ollama_url}/api/generate"
+ body = {
+ "model": self.chat_model,
+ "prompt": prompt,
+ "messages": self.conversation_history,
+ "stream": True
+ }
+ resp = requests.post(url, json=body, stream=True, timeout=600)
+ if resp.status_code != 200:
+ raise RuntimeError(f"Ошибка генерации ответа: {resp.status_code} {resp.text}")
+ full_answer = ""
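+        # Ollama streams newline-delimited JSON objects, each carrying a "response" token.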
+ for chunk in resp.iter_lines():
+ if chunk:
+ try:
+ decoded_chunk = chunk.decode('utf-8')
+ data = json.loads(decoded_chunk)
+ if "response" in data:
+ yield data["response"]
+ full_answer += data["response"]
+ elif "error" in data:
+ print(f"Stream error: {data['error']}")
+ break
+ except json.JSONDecodeError:
+ print(f"Could not decode JSON from chunk: {chunk.decode('utf-8')}")
+ except Exception as e:
+ print(f"Error processing chunk: {e}")
+
+ def get_prompt_eval_count(self):
+ return self.response.json().get("prompt_eval_count", 0)
+
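+    # Ollama reports *_duration fields in nanoseconds; divide by 1e9 to get seconds.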
+ def get_prompt_eval_duration(self):
+ return self.response.json().get("prompt_eval_duration", 0) / (10 ** 9)
+
+ def get_eval_count(self):
+ return self.response.json().get("eval_count", 0)
+
+ def get_eval_duration(self):
+ return self.response.json().get("eval_duration", 0) / (10 ** 9)
+
+ def get_total_duration(self):
+ return self.response.json().get("total_duration", 0) / (10 ** 9)
+
+ def get_tps(self):
+ eval_count = self.get_eval_count()
+ eval_duration = self.get_eval_duration()
+ if eval_count == 0 or eval_duration == 0:
+ return 0
+ return eval_count / eval_duration
def print_sources(context_docs: list):
+ print("\n\nИсточники:")
for idx, doc in enumerate(context_docs, start=1):
filename = doc['payload'].get("filename", None)
url = doc['payload'].get("url", None)
title = filename
if url is None:
url = "(нет веб-ссылки)"
- print(f"{idx}. {title}\n {url}")
+ print(f"{idx}. {title}\n {url}\n")
+
+def print_v(text: str, is_verbose: bool):
+ if is_verbose:
+ print(text)
+
+def print_stats(rag: RagSystem):
+    print("Statistics:")
+    print(f"* Total time: {rag.get_total_duration()}s")
+    print(f"* Tokens per second: {rag.get_tps()}")
+    print(f"* Prompt eval count: {rag.get_prompt_eval_count()}")
+    print(f"* Prompt eval duration: {rag.get_prompt_eval_duration()}s")
+    print(f"* Eval count: {rag.get_eval_count()}")
+    print(f"* Eval duration: {rag.get_eval_duration()}s\n")
def main():
import sys
@@ -114,41 +204,43 @@ def main():
parser.add_argument("--qdrant-port", type=int, default=6333, help="Qdrant port")
parser.add_argument("--ollama-url", default="http://localhost:11434", help="Ollama API URL")
parser.add_argument("--emb-model", default="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", help="Модель эмбеддинга")
- parser.add_argument("--chat-model", default="qwen2.5:3b", help="Модель генерации Ollama")
+ parser.add_argument("--chat-model", default="phi4-mini:3.8b", help="Модель генерации Ollama")
parser.add_argument("--topk", type=int, default=6, help="Количество документов для поиска")
+ parser.add_argument("--verbose", default=False, action=argparse.BooleanOptionalAction, help="Выводить промежуточные служебные сообщения")
+ parser.add_argument("--show-stats", default=False, action=argparse.BooleanOptionalAction, help="Выводить статистику об ответе (не работает с --stream)")
+ parser.add_argument("--stream", default=False, action=argparse.BooleanOptionalAction, help="Выводить статистику об ответе")
args = parser.parse_args()
if not args.query and not args.interactive:
print("Ошибка: укажите запрос (--query) и/или используйте интерактивный режим (--interactive)")
sys.exit(1)
- print(f"Адрес ollama: {args.ollama_url}")
- print(f"Адрес qdrant: {args.qdrant_host}:{args.qdrant_port}")
- print(f"Модель эмбеддинга: {args.emb_model}")
- print(f"Модель чата: {args.chat_model}")
- print(f"Документов для поиска: {args.topk}")
+ print_v(f"Адрес ollama: {args.ollama_url}", args.verbose)
+ print_v(f"Адрес qdrant: {args.qdrant_host}:{args.qdrant_port}", args.verbose)
+ print_v(f"Модель эмбеддинга: {args.emb_model}", args.verbose)
+ print_v(f"Модель чата: {args.chat_model}", args.verbose)
+ print_v(f"Документов для поиска: {args.topk}", args.verbose)
if os.path.exists('sys_prompt.txt'):
- print("Будет использоваться sys_prompt.txt!")
+ print_v("Будет использоваться sys_prompt.txt!", args.verbose)
- print("\nПервая инициализация моделей...")
- rag = LocalRAGSystem(
+ print_v("\nПервая инициализация моделей...", args.verbose)
+ rag = RagSystem(
ollama_url=args.ollama_url,
qdrant_host=args.qdrant_host,
qdrant_port=args.qdrant_port,
embed_model=args.emb_model,
chat_model=args.chat_model
)
- print(f"Модели загружены. Если ответ плохой, переформулируйте запрос, укажите --chat-model или улучшите исходные данные RAG")
+ print_v(f"Модели загружены. Если ответ плохой, переформулируйте запрос, укажите --chat-model или улучшите исходные данные RAG", args.verbose)
+ query = None
if args.interactive:
- print("\nИНТЕРАКТИВНЫЙ РЕЖИМ")
- print("Можете вводить запрос (или 'exit' для выхода)\n")
+ print_v("\nИНТЕРАКТИВНЫЙ РЕЖИМ", args.verbose)
+ print_v("Можете вводить запрос (или 'exit' для выхода)\n", args.verbose)
if args.query:
query = args.query.strip()
print(f">>> {query}")
- else:
- query = input(">>> ").strip()
while True:
try:
@@ -158,34 +250,103 @@ def main():
if not query or query == "":
continue
- if query.lower() == "exit":
- print("\n*** Завершение работы")
+ if query.lower() == "help":
+ print("<<< Команды итерактивного режима:")
+ print("save -- сохранить диалог в файл")
+ print("exit -- выход\n")
+ query = None
+ continue
+
+ if query.strip().lower() == "save":
+ import datetime
+ timestamp = int(time.time())
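+                # NOTE: fromtimestamp() returns local time, so the "Z" (UTC) suffix in the
+                # format string below is only nominal.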
+ dt = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%dT%H:%M:%SZ')
+ filename = f"chats/chat-{timestamp}.md"
+
+ markdown_content = f"# История диалога от {dt}\n\n"
+ markdown_content += f"## Параметры диалога\n"
+ markdown_content += f"```\nargs = {args}\n```\n"
+ markdown_content += f"```\nemb_model = {rag.emb_model}\n```\n"
+
+ for entry in rag.conversation_history:
+ if entry['role'] == 'user':
+ markdown_content += f"## Пользователь\n\n"
+ elif entry['role'] == 'assistant':
+ markdown_content += f"## Модель\n\n"
+ docs = rag.prepare_sources(entry['docs']).replace("```", "")
+ markdown_content += f"```\n{docs}\n```\n\n"
+ markdown_content += f"{entry['content']}\n\n"
+
+ os.makedirs('chats', exist_ok=True)
+ with open(filename, 'w') as fp:
+ fp.write(markdown_content)
+
+ print(f"<<< Диалог сохранён в файл: {filename}\n")
+ query = None
+ continue
+
+ if query.strip().lower() == "exit":
+ print_v("\n*** Завершение работы", args.verbose)
break
- print("\nПоиск релевантных документов...")
+ print_v("\nПоиск релевантных документов...", args.verbose)
context_docs = rag.search_qdrant(query, top_k=args.topk)
if not context_docs:
- print("Релевантные документы не найдены.")
+ print_v("Релевантные документы не найдены.", args.verbose)
if args.interactive:
+ query = None
continue
else:
break
- print(f"Найдено {len(context_docs)} релевантных документов:")
- print_sources(context_docs)
+ print_v(f"Найдено {len(context_docs)} релевантных документов", args.verbose)
+ # print_sources(context_docs)
prompt = rag.prepare_prompt(query=query, context_docs=context_docs)
if args.show_prompt:
- print("\nПолный системный промпт: --------------------------\n")
- print(f"{prompt}\n---------------------------------------------------\n")
+ print("\nПолный системный промпт: --------------------------")
+ print(f"{prompt}\n---------------------------------------------------")
+
+ print_v("\nГенерация ответа...\n", args.verbose)
+
+ if args.stream:
+ answer = "\n<<< "
+ print(answer, end='', flush=True)
+ try:
+ for message_part in rag.generate_answer_stream(prompt):
+ answer += message_part
+ print(message_part, end='', flush=True)
+                except RuntimeError as e:
+                    answer = str(e)
+                    print(f"\n{answer}\n===================================================\n")
+ else:
+ answer = rag.generate_answer(prompt)
+ print(f"<<< {answer}\n")
+
+ print_sources(context_docs)
+ if args.show_stats and not args.stream:
+ print_stats(rag)
+
+ rag.conversation_history.append({
+ "role": "user",
+ "content": query,
+ })
+
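+            # "docs" is stashed with the answer so the "save" command can dump the sources.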
+ rag.conversation_history.append({
+ "role": "assistant",
+ "docs": context_docs,
+ "content": answer,
+ })
+
+ if args.interactive:
+ query = None
+ else:
+ break
- print("Генерация ответа...")
- answer = rag.generate_answer(prompt)
- print(f"\n<<< {answer}\n===================================================\n")
- query = None
except KeyboardInterrupt:
print("\n*** Завершение работы")
break
+
except Exception as e:
print(f"Ошибка: {e}")
break