diff --git a/.gitignore b/.gitignore
index b329699..c37c8c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
.env
*.html
+*.pdf
*.sqlite*
!.gitkeep
diff --git a/rag/1_download_page.sh b/rag/1_download_html.sh
similarity index 84%
rename from rag/1_download_page.sh
rename to rag/1_download_html.sh
index 6efe1d9..9cc5686 100755
--- a/rag/1_download_page.sh
+++ b/rag/1_download_html.sh
@@ -17,7 +17,7 @@ source .env
[ -z "$CONF_PASSWORD" ] && { echo >&2 "Ошибка: CONF_PASSWORD не указан в файле .env"; exit 1; }
PAGE_ID="$1"
-API_ENDPOINT="${CONF_URL}/rest/api/content/${PAGE_ID}?expand=body.storage,children.page"
+API_ENDPOINT="${CONF_URL}/rest/api/content/${PAGE_ID}?expand=body.view,children.page"
echo
echo "Загрузка: $API_ENDPOINT"
@@ -38,9 +38,16 @@ output_path="./input_html"
[ ! -d "$output_path" ] && mkdir -p "$output_path"
title=$(echo "$response" | jq -r .title)
-content=$(echo "$response" | jq -r .body.storage.value)
+content=$(echo "$response" | jq -r .body.view.value)
-echo "
Страница: $title
$content" > "$output_path/$title.html"
-echo "Сохранено: $output_path/$title.html"
+# An empty page is not saved, but the script keeps going so that its child
+# pages are still visited; jq -r prints "null" for a missing value.
+if [ -z "$content" ] || [ "$content" = "null" ]; then
+    echo "Пустая страница, пропущено"
+else
+    path="$output_path/${title//\//_}.html"
+    echo "Страница: $title
$content" > "$path"
+    echo "Сохранено: $path"
+fi
child_ids=$(echo "$response" | jq -r '.children.page.results[]?.id' 2>/dev/null)
diff --git a/rag/1_download_pdf.sh b/rag/1_download_pdf.sh
new file mode 100755
index 0000000..b2232e9
--- /dev/null
+++ b/rag/1_download_pdf.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Recursively exports a Confluence page and its child pages to PDF.
+#
+# Usage: 1_download_pdf.sh <PAGE_ID>
+#
+# CONF_URL, CONF_USERNAME and CONF_PASSWORD are read from ./.env (copied from
+# .env.example when missing). Every page is stored as ./input_pdf/<title>.pdf
+# with "/" in the title replaced by "_"; the script then re-invokes itself
+# for each child page ID returned by the REST API.
+
+command -v curl >/dev/null 2>&1 || { echo >&2 "Ошибка: curl не установлен"; exit 1; }
+command -v jq >/dev/null 2>&1 || { echo >&2 "Ошибка: jq не установлен"; exit 1; }
+
+if [ $# -lt 1 ]; then
+  echo >&2 "Ошибка: не указан ID страницы для загрузки"
+  echo >&2 "Использование: $0 <PAGE_ID>"
+  exit 1
+fi
+
+[ ! -f .env ] && cp .env.example .env
+source .env
+
+[ -z "$CONF_URL" ] && { echo >&2 "Ошибка: CONF_URL не указан в файле .env"; exit 1; }
+[ -z "$CONF_USERNAME" ] && { echo >&2 "Ошибка: CONF_USERNAME не указан в файле .env"; exit 1; }
+[ -z "$CONF_PASSWORD" ] && { echo >&2 "Ошибка: CONF_PASSWORD не указан в файле .env"; exit 1; }
+
+PAGE_ID="$1"
+API_ENDPOINT="${CONF_URL}/spaces/flyingpdf/pdfpageexport.action?pageId=${PAGE_ID}"
+
+OUTPUT_PATH="./input_pdf"
+PDF_PATH="$OUTPUT_PATH/$PAGE_ID.pdf"
+mkdir -p "$OUTPUT_PATH" || exit 1
+
+echo
+echo "Загрузка: $API_ENDPOINT"
+
+# --fail turns HTTP errors into a non-zero exit status so an error page is
+# never kept as a "PDF"; --show-error keeps the reason despite --silent.
+# stdout goes to the file, so $result only ever holds curl's error text.
+if ! result=$(curl \
+    --silent --show-error --fail \
+    --location \
+    --user "$CONF_USERNAME:$CONF_PASSWORD" \
+    --header "Accept: application/json" \
+    --output "$PDF_PATH" \
+    "${API_ENDPOINT}" 2>&1) || [ ! -s "$PDF_PATH" ]; then
+  rm -f -- "$PDF_PATH"
+  echo >&2 "Ошибка $result"
+  exit 1
+fi
+
+# Fetch the page title and the IDs of its direct children.
+API_ENDPOINT="${CONF_URL}/rest/api/content/${PAGE_ID}?expand=children.page"
+if ! response=$(curl \
+    --silent --show-error \
+    --user "$CONF_USERNAME:$CONF_PASSWORD" \
+    --header "Accept: application/json" \
+    "${API_ENDPOINT}"); then
+  echo >&2 "$response"
+  exit 1
+fi
+
+# A non-null top-level "message" field is treated as an API error reply.
+error_message=$(jq -r '.message // empty' <<<"$response" 2>/dev/null)
+if [ -n "$error_message" ]; then
+  echo >&2 "$response"
+  exit 1
+fi
+
+# Rename <id>.pdf to <title>.pdf; without a usable title keep the ID-based
+# name instead of producing "null.pdf".
+title=$(jq -r '.title // empty' <<<"$response" 2>/dev/null)
+if [ -n "$title" ]; then
+  PDF_TITLED_PATH="$OUTPUT_PATH/${title//\//_}.pdf"
+  mv -- "$PDF_PATH" "$PDF_TITLED_PATH" || exit 1
+else
+  PDF_TITLED_PATH="$PDF_PATH"
+fi
+echo "Сохранено: $PDF_TITLED_PATH"
+
+# Word splitting of $child_ids is intended; assumes IDs contain no whitespace.
+# NOTE(review): children.page may be paginated by the API - confirm whether
+# pages with many children need follow-up requests.
+child_ids=$(jq -r '.children.page.results[]?.id' <<<"$response" 2>/dev/null)
+for child_id in $child_ids; do
+  echo "Переход к дочерней странице: $child_id"
+  "$0" "$child_id"
+done
diff --git a/rag/3_rag.py b/rag/3_rag.py
index 47304ae..c10c988 100644
--- a/rag/3_rag.py
+++ b/rag/3_rag.py
@@ -444,7 +444,7 @@ def main():
parser.add_argument("--md-folder", default="output_md", help="Папка с markdown файлами")
parser.add_argument("--embed-model", default="nomic-embed-text", help="Модель для эмбеддингов")
parser.add_argument("--chat-model", default="phi4-mini:3.8b", help="Модель для чата")
- parser.add_argument("--results", type=int, default=10, help="Количество результатов поиска")
+ parser.add_argument("--results", type=int, default=6, help="Количество результатов поиска")
args = parser.parse_args()
diff --git a/rag/input_pdf/.gitkeep b/rag/input_pdf/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/rag/quickstart.sh b/rag/quickstart.sh
index e415036..c5e92f8 100755
--- a/rag/quickstart.sh
+++ b/rag/quickstart.sh
@@ -19,7 +19,9 @@ source venv/bin/activate
[ "$(pip install --dry-run beautifulsoup4 2>&1 | grep -c 'Would install')" -gt 0 ] && pip install beautifulsoup4
echo "Начало работы"
-../up
-./1_download_page.sh "$@" || exit 1
+# Run ./up from the parent directory inside a subshell so this script's own
+# working directory is left untouched and ./up is skipped if cd fails.
+(cd .. && ./up)
+./1_download_html.sh "$@" || exit 1
python3 ./2_html_to_md.py
python3 ./3_rag.py --action build