From 36e278529045944061284ebe7171ef5d83fbbb1c Mon Sep 17 00:00:00 2001 From: AnthonyAxenov Date: Thu, 21 Aug 2025 21:17:51 +0800 Subject: [PATCH] =?UTF-8?q?=D0=9F=D0=BE=D0=BB=D1=83=D1=87=D0=B5=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20pdf,=20=D1=84=D0=B8=D0=BA=D1=81=20=D0=B8=D0=BC?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=20=D1=84=D0=B0=D0=B9=D0=BB=D0=B0=20=D0=BF?= =?UTF-8?q?=D1=80=D0=B8=20=D0=BD=D0=B0=D0=BB=D0=B8=D1=87=D0=B8=D0=B8=20?= =?UTF-8?q?=D1=81=D0=BB=D0=B5=D1=88=D0=B0,=20=D1=84=D0=B8=D0=BA=D1=81=20qu?= =?UTF-8?q?ickstart=20=D0=B8=20=D0=BC=D0=B5=D0=BB=D0=BE=D1=87=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + ...{1_download_page.sh => 1_download_html.sh} | 15 +++- rag/1_download_pdf.sh | 70 +++++++++++++++++++ rag/3_rag.py | 2 +- rag/input_pdf/.gitkeep | 0 rag/quickstart.sh | 2 +- 6 files changed, 85 insertions(+), 5 deletions(-) rename rag/{1_download_page.sh => 1_download_html.sh} (84%) create mode 100755 rag/1_download_pdf.sh create mode 100644 rag/input_pdf/.gitkeep diff --git a/.gitignore b/.gitignore index b329699..c37c8c2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ .env *.html +*.pdf *.sqlite* !.gitkeep diff --git a/rag/1_download_page.sh b/rag/1_download_html.sh similarity index 84% rename from rag/1_download_page.sh rename to rag/1_download_html.sh index 6efe1d9..9cc5686 100755 --- a/rag/1_download_page.sh +++ b/rag/1_download_html.sh @@ -17,7 +17,10 @@ source .env [ -z "$CONF_PASSWORD" ] && { echo >&2 "Ошибка: CONF_PASSWORD не указан в файле .env"; exit 1; } PAGE_ID="$1" -API_ENDPOINT="${CONF_URL}/rest/api/content/${PAGE_ID}?expand=body.storage,children.page" +API_ENDPOINT="${CONF_URL}/rest/api/content/${PAGE_ID}?expand=body.view,children.page" + +OUTPUT_PATH="./input_pdf" +[ ! -d "$OUTPUT_PATH" ] && mkdir -p "$OUTPUT_PATH" echo echo "Загрузка: $API_ENDPOINT" @@ -38,9 +41,15 @@ output_path="./input_html" [ ! -d "$output_path" ] && mkdir -p "$output_path" title=$(echo "$response" | jq -r .title) -content=$(echo "$response" | jq -r .body.storage.value) +content=$(echo "$response" | jq -r .body.view.value) -echo "Страница: $title

$content" > "$output_path/$title.html" +if [ -z "$content" ]; then + echo "Пустая страница, пропущено" + exit +fi + +path="$output_path/${title//\//_}.html" +echo "Страница: $title

$content" > "$path" echo "Сохранено: $output_path/$title.html" child_ids=$(echo "$response" | jq -r '.children.page.results[]?.id' 2>/dev/null) diff --git a/rag/1_download_pdf.sh b/rag/1_download_pdf.sh new file mode 100755 index 0000000..b2232e9 --- /dev/null +++ b/rag/1_download_pdf.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +command -v curl >/dev/null 2>&1 || { echo >&2 "Ошибка: curl не установлен"; exit 1; } +command -v jq >/dev/null 2>&1 || { echo >&2 "Ошибка: jq не установлен"; exit 1; } + +if [ $# -lt 1 ]; then + echo >&2 "Ошибка: не указан ID страницы для загрузки" + echo "Использование: $0 " + exit 1 +fi + +[ ! -f .env ] && cp .env.example .env +source .env + +[ -z "$CONF_URL" ] && { echo >&2 "Ошибка: CONF_URL не указан в файле .env"; exit 1; } +[ -z "$CONF_USERNAME" ] && { echo >&2 "Ошибка: CONF_USERNAME не указан в файле .env"; exit 1; } +[ -z "$CONF_PASSWORD" ] && { echo >&2 "Ошибка: CONF_PASSWORD не указан в файле .env"; exit 1; } + +PAGE_ID="$1" +API_ENDPOINT="${CONF_URL}/spaces/flyingpdf/pdfpageexport.action?pageId=${PAGE_ID}" + +OUTPUT_PATH="./input_pdf" +PDF_PATH="$OUTPUT_PATH/$PAGE_ID.pdf" +[ ! -d "$OUTPUT_PATH" ] && mkdir -p "$OUTPUT_PATH" + +echo +echo "Загрузка: $API_ENDPOINT" + +result=$(curl \ + --silent \ + --location \ + --user "$CONF_USERNAME:$CONF_PASSWORD" \ + --header "Accept: application/json" \ + --output "$PDF_PATH" \ + "${API_ENDPOINT}") + +if [ ! -f "$PDF_PATH" ]; then + echo "Ошибка $result" + exit +fi + +API_ENDPOINT="${CONF_URL}/rest/api/content/${PAGE_ID}?expand=children.page" +response=$(curl \ + --silent \ + --user "$CONF_USERNAME:$CONF_PASSWORD" \ + --header "Accept: application/json" \ + "${API_ENDPOINT}" +) + +if [ $? -ne 0 ]; then + echo "$response" + exit 1 +fi + +error_message=$(echo "$response" | jq -r '.message' 2>/dev/null) +if [ -n "$error_message" ] && [ "$error_message" != "null" ]; then + echo "$response" + exit 1 +fi + +title=$(echo "$response" | jq -r .title) +PDF_TITLED_PATH="$OUTPUT_PATH/${title//\//_}.pdf" +mv "$PDF_PATH" "$PDF_TITLED_PATH" +echo "Сохранено: $PDF_TITLED_PATH" + +child_ids=$(echo "$response" | jq -r '.children.page.results[]?.id' 2>/dev/null) +for child_id in $child_ids; do + echo "Переход к дочерней странице: $child_id" + $0 "$child_id" +done diff --git a/rag/3_rag.py b/rag/3_rag.py index 47304ae..c10c988 100644 --- a/rag/3_rag.py +++ b/rag/3_rag.py @@ -444,7 +444,7 @@ def main(): parser.add_argument("--md-folder", default="output_md", help="Папка с markdown файлами") parser.add_argument("--embed-model", default="nomic-embed-text", help="Модель для эмбеддингов") parser.add_argument("--chat-model", default="phi4-mini:3.8b", help="Модель для чата") - parser.add_argument("--results", type=int, default=10, help="Количество результатов поиска") + parser.add_argument("--results", type=int, default=6, help="Количество результатов поиска") args = parser.parse_args() diff --git a/rag/input_pdf/.gitkeep b/rag/input_pdf/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/rag/quickstart.sh b/rag/quickstart.sh index e415036..c5e92f8 100755 --- a/rag/quickstart.sh +++ b/rag/quickstart.sh @@ -19,7 +19,7 @@ source venv/bin/activate [ "$(pip install --dry-run beautifulsoup4 2>&1 | grep -c 'Would install')" -gt 0 ] && pip install beautifulsoup4 echo "Начало работы" -../up +cd ..; ./up; cd - ./1_download_page.sh "$@" || exit 1 python3 ./2_html_to_md.py python3 ./3_rag.py --action build