Скрипты генерации rag

2025-08-18 09:01:26 +08:00
parent acad96a7b7
commit 64a63f048a
12 changed files with 1040 additions and 11 deletions
--- a/@rag/2_html_to_md.py
+++ b/@rag/2_html_to_md.py
@@ -0,0 +1,325 @@
+#!/usr/bin/env python3
+"""
+RAG System for Local Ollama
+Конвертирует html-файлы в markdown, очищая от лишней разметки
+Скрипт сгенерирован claude-sonnet-4
+"""
+
+import os
+import re
+import json
+from bs4 import BeautifulSoup
+from pathlib import Path
+
+
+def _is_confluence_tag(tag):
+    """Return True for an element whose tag name starts with ``ac:``."""
+    return bool(tag.name) and tag.name.startswith('ac:')
+
+
+def _replace_confluence_macro(soup, macro):
+    """
+    Replace or remove a single ``ac:structured-macro`` element.
+
+    Every branch takes ``macro`` out of the tree, either by substituting a
+    plain ``<div>`` for it or by destroying it together with its content.
+    """
+    macro_name = macro.get('ac:name', '')
+
+    if macro_name == 'note':
+        # Notes become an attention block holding the note text.
+        rich_text = macro.find('ac:rich-text-body')
+        if rich_text is None:
+            macro.decompose()
+            return
+        # A separator keeps text from adjacent elements from being glued
+        # together ("<p>a</p><p>b</p>" -> "a b" instead of "ab").
+        note_content = rich_text.get_text(separator=' ', strip=True)
+        note_tag = soup.new_tag('div')
+        # Set the attribute explicitly: new_tag(..., class_='note') would
+        # create an attribute literally named "class_".
+        note_tag['class'] = 'note'
+        note_tag.string = f"📝 **Примечание:** {note_content}"
+        macro.replace_with(note_tag)
+    elif macro_name == 'toc':
+        # The table of contents is generated by Confluence; keep a marker.
+        toc_tag = soup.new_tag('div')
+        toc_tag.string = "**Содержание** (автогенерируется)"
+        macro.replace_with(toc_tag)
+    elif macro_name == 'drawio':
+        # Diagrams cannot be converted to text; keep a named placeholder.
+        diagram_name = macro.find('ac:parameter', {'ac:name': 'diagramName'})
+        if diagram_name is None:
+            diagram_text = 'Диаграмма'
+        else:
+            diagram_text = diagram_name.get_text()
+        diagram_tag = soup.new_tag('div')
+        diagram_tag.string = f"🖼️ **Диаграмма:** {diagram_text}"
+        macro.replace_with(diagram_tag)
+    else:
+        # Any other macro is dropped together with its content.
+        macro.decompose()
+
+
+def clean_confluence_html(soup):
+    """
+    Remove Confluence-specific elements from a parsed document.
+
+    ``ac:structured-macro`` elements are handled by their ``ac:name``:
+
+    * ``note``   -> ``<div class="note">`` with the note text (the macro is
+      removed if it has no ``ac:rich-text-body``);
+    * ``toc``    -> placeholder ``<div>``;
+    * ``drawio`` -> placeholder ``<div>`` naming the diagram (taken from the
+      ``diagramName`` parameter, with a generic label as fallback);
+    * anything else is removed together with its content.
+
+    Afterwards every remaining element whose tag name starts with ``ac:``
+    is removed as well.
+
+    Args:
+        soup: BeautifulSoup document; it is modified in place.
+
+    Returns:
+        The same ``soup`` object.
+    """
+    # Re-query after every step instead of iterating a find_all() snapshot:
+    # a snapshot still lists macros nested inside a macro that has already
+    # been decomposed, and reading attributes of a decomposed element
+    # raises AttributeError. Each step removes one macro from the tree, so
+    # the loop terminates.
+    macro = soup.find('ac:structured-macro')
+    while macro is not None:
+        _replace_confluence_macro(soup, macro)
+        macro = soup.find('ac:structured-macro')
+
+    # Remove the remaining Confluence elements, outermost first.
+    leftover = soup.find(_is_confluence_tag)
+    while leftover is not None:
+        leftover.decompose()
+        leftover = soup.find(_is_confluence_tag)
+
+    return soup
+
+
+def _table_cell_text(cell):
+    """Return a cell's text on a single line with ``|`` escaped."""
+    cell_text = cell.get_text(separator=' ', strip=True)
+    cell_text = re.sub(r'\s+', ' ', cell_text)  # collapse whitespace runs
+    return cell_text.replace('|', '\\|')        # a bare pipe would split the cell
+
+
+def _row_belongs_to(table, row):
+    """Return True if no nested ``<table>`` lies between ``row`` and ``table``."""
+    for parent in row.parents:
+        if parent is table:
+            return True
+        if parent.name == 'table':
+            return False
+    return False
+
+
+def _row_cells(row):
+    """Return the ``th``/``td`` cells of ``row``, excluding nested-table cells."""
+    return [
+        cell for cell in row.find_all(['th', 'td'])
+        if cell.find_parent('tr') is row
+    ]
+
+
+def convert_table_to_markdown(table):
+    """
+    Convert an HTML table into a Markdown pipe table.
+
+    The first row is used as the header. Data rows are truncated or padded
+    with empty cells to the header's column count, and rows without cells
+    are skipped. Rows and cells of nested tables are not emitted as rows of
+    this table; their text only appears inside the cell that contains them.
+
+    Args:
+        table: the ``<table>`` element to convert.
+
+    Returns:
+        The Markdown table as a string, or ``""`` when the table has no
+        rows or its first row has no cells.
+    """
+    # find_all() is recursive, so rows of nested tables must be filtered
+    # out; identity comparison is used because tags compare equal by content.
+    rows = [row for row in table.find_all('tr') if _row_belongs_to(table, row)]
+    if not rows:
+        return ""
+
+    header_cells = _row_cells(rows[0])
+    if not header_cells:
+        return ""
+
+    column_count = len(header_cells)
+    markdown_lines = [
+        "|" + "".join(f" {_table_cell_text(cell)} |" for cell in header_cells),
+        "|" + " --- |" * column_count,
+    ]
+
+    for row in rows[1:]:
+        data_cells = _row_cells(row)
+        if not data_cells:
+            continue
+
+        # Never emit more columns than the header has.
+        cell_texts = [_table_cell_text(cell) for cell in data_cells[:column_count]]
+        data_line = "|" + "".join(f" {cell_text} |" for cell_text in cell_texts)
+        # Pad short rows so every line has the same number of columns.
+        data_line += "  |" * (column_count - len(cell_texts))
+
+        markdown_lines.append(data_line)
+
+    return "\n".join(markdown_lines)
+
+
+def extract_json_blocks(soup):
+    """
+    Find JSON inside ``pre``, ``code`` and ``script`` elements and format it.
+
+    An element whose text starts and ends with ``{}`` or ``[]`` is parsed as
+    JSON. On success it is replaced by a ``<pre>`` holding the pretty-printed
+    JSON inside a ```` ```json ```` fence. If parsing fails, the text of a
+    ``pre``/``code`` element is wrapped in a plain code fence instead and a
+    ``script`` element is left untouched.
+
+    Args:
+        soup: BeautifulSoup document; it is modified in place.
+
+    Returns:
+        List of the pretty-printed JSON strings, one per replaced element.
+    """
+    json_blocks = []
+
+    for element in soup.find_all(['pre', 'code', 'script']):
+        # The snapshot also lists elements nested in one that was already
+        # rewritten (e.g. <pre><code>...</code></pre>). Those are detached
+        # from the document by now; handling them again would report the
+        # same JSON twice, and replace_with() fails on a parentless element.
+        if not any(parent is soup for parent in element.parents):
+            continue
+
+        text_content = element.get_text(strip=True)
+
+        # Cheap shape check before attempting a real parse.
+        looks_like_json = (
+            (text_content.startswith('{') and text_content.endswith('}'))
+            or (text_content.startswith('[') and text_content.endswith(']'))
+        )
+        if not looks_like_json:
+            continue
+
+        try:
+            json_data = json.loads(text_content)
+        except json.JSONDecodeError:
+            # Not valid JSON after all: keep it as a generic code block.
+            if element.name in ('pre', 'code'):
+                element.string = f"```\n{text_content}\n```"
+            continue
+
+        formatted_json = json.dumps(json_data, indent=2, ensure_ascii=False)
+
+        json_tag = soup.new_tag('pre')
+        json_tag.string = f"```json\n{formatted_json}\n```"
+        element.replace_with(json_tag)
+        json_blocks.append(formatted_json)
+
+    return json_blocks
+
+
+def _single_line_text(tag):
+    """Return the text of ``tag`` with whitespace runs collapsed to one space."""
+    return re.sub(r'\s+', ' ', tag.get_text()).strip()
+
+
+def _compact_markdown_lines(text):
+    """
+    Drop blank lines and strip the remaining ones.
+
+    Lines between a ```` ``` ```` opening fence and its closing fence keep
+    their leading whitespace so that pretty-printed JSON stays indented.
+    """
+    kept_lines = []
+    in_fence = False
+
+    for raw_line in text.split('\n'):
+        stripped = raw_line.strip()
+        if not stripped:
+            continue
+
+        if not in_fence and re.fullmatch(r'```\w*', stripped):
+            in_fence = True
+            kept_lines.append(stripped)
+        elif in_fence and stripped == '```':
+            in_fence = False
+            kept_lines.append(stripped)
+        elif in_fence:
+            kept_lines.append(raw_line.rstrip())
+        else:
+            kept_lines.append(stripped)
+
+    return '\n'.join(kept_lines)
+
+
+def html_to_markdown(html_content):
+    """
+    Convert an HTML document into Markdown-flavoured plain text.
+
+    Steps, in order: remove ``script``/``style``; clean Confluence markup;
+    format JSON code blocks; convert tables to pipe tables; prefix headings
+    with ``#``; wrap ``strong``/``b`` in ``**`` and ``em``/``i`` in ``*``;
+    render ``ul``/``ol`` as ``- ``/``N. `` items; extract the text and drop
+    blank lines.
+
+    Limitations: a nested list is flattened into the text of its parent
+    item, and because the text is extracted one string per line, inline
+    elements outside headings, lists and tables end up on their own lines.
+
+    Args:
+        html_content: HTML source as a string.
+
+    Returns:
+        The converted text; it contains no blank lines.
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Scripts and styles never carry document text.
+    for element in soup(['script', 'style']):
+        element.decompose()
+
+    soup = clean_confluence_html(soup)
+
+    # Called for its side effect on the tree; the returned list is not needed.
+    extract_json_blocks(soup)
+
+    for table in soup.find_all('table'):
+        markdown_table = convert_table_to_markdown(table)
+        if markdown_table:
+            table_div = soup.new_tag('div')
+            # new_tag(..., class_=...) would create an attribute named "class_".
+            table_div['class'] = 'markdown-table'
+            table_div.string = f"\n{markdown_table}\n"
+            table.replace_with(table_div)
+
+    # Headings: collapse the content to one line so that words from adjacent
+    # inline elements are not glued together.
+    for level in range(1, 7):
+        for header in soup.find_all(f'h{level}'):
+            header_text = _single_line_text(header)
+            if not header_text:
+                continue  # an empty heading would only produce a bare "#"
+            header.string = '#' * level + ' ' + header_text
+
+    # Emphasis is handled before lists: a list replaces its children with
+    # plain text, so markers added afterwards would never reach the output.
+    for tag_names, marker in ((['strong', 'b'], '**'), (['em', 'i'], '*')):
+        for inline in soup.find_all(tag_names):
+            text = inline.get_text()
+            if text.strip():
+                inline.string = f"{marker}{text}{marker}"
+
+    for ul in soup.find_all('ul'):
+        list_items = ul.find_all('li', recursive=False)
+        if list_items:
+            ul.string = '\n'.join(
+                f"- {_single_line_text(li)}" for li in list_items
+            )
+
+    for ol in soup.find_all('ol'):
+        list_items = ol.find_all('li', recursive=False)
+        if list_items:
+            ol.string = '\n'.join(
+                f"{number}. {_single_line_text(li)}"
+                for number, li in enumerate(list_items, 1)
+            )
+
+    text = soup.get_text(separator='\n', strip=True)
+
+    return _compact_markdown_lines(text)
+
+
+def process_files(input_dir, output_dir):
+    """
+    Convert every ``*.html`` file in ``input_dir`` to a ``.md`` file.
+
+    Files are processed in sorted order. Empty files are skipped, a failure
+    in one file is reported and does not stop the run, and a summary is
+    printed at the end. Nothing is converted if ``input_dir`` is not a
+    directory or contains no ``*.html`` files.
+
+    Args:
+        input_dir: directory containing the HTML files (not searched
+            recursively).
+        output_dir: directory for the Markdown files; created if missing.
+            Each output file is named after its source file's stem.
+    """
+    input_path = Path(input_dir)
+    output_path = Path(output_dir)
+
+    # is_dir() also rejects a path that exists but is a regular file.
+    if not input_path.is_dir():
+        print(f"❌ Директория {input_dir} не найдена")
+        return
+
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # glob() order depends on the file system; sort for reproducible runs.
+    html_files = sorted(input_path.glob('*.html'))
+
+    if not html_files:
+        print(f"❌ HTML файлы не найдены в {input_dir}")
+        return
+
+    print(f"📁 Найдено {len(html_files)} HTML файлов")
+
+    successful = 0
+    skipped = 0
+    failed_files = []
+
+    for html_file in html_files:
+        print(f"🔄 Обработка: {html_file.name}")
+
+        try:
+            html_content = html_file.read_text(encoding='utf-8')
+
+            if not html_content.strip():
+                print(f"⚠️ Пропущен: {html_file.name} (пустой файл)")
+                skipped += 1
+                continue
+
+            markdown_content = html_to_markdown(html_content)
+
+            # An empty result is still written, but flagged.
+            if not markdown_content.strip():
+                print(f"⚠️ Предупреждение: {html_file.name} - результат конвертации пустой")
+
+            md_filename = html_file.stem + '.md'
+            (output_path / md_filename).write_text(markdown_content, encoding='utf-8')
+
+            print(f"✅ Сохранено: {md_filename}")
+            successful += 1
+
+        except UnicodeDecodeError as e:
+            print(f"❌ Ошибка кодировки в {html_file.name}: {e}")
+            failed_files.append((html_file.name, f"Ошибка кодировки: {e}"))
+        except Exception as e:
+            # Per-file boundary: record the failure and continue with the
+            # remaining files.
+            print(f"❌ Ошибка при обработке {html_file.name}: {e}")
+            failed_files.append((html_file.name, str(e)))
+
+    print("\n📊 Результат:")
+    print(f"✅ Успешно обработано: {successful}")
+    if skipped:
+        # Without this line the totals would not add up to the files found.
+        print(f"⚠️ Пропущено (пустые файлы): {skipped}")
+    print(f"❌ Ошибок: {len(failed_files)}")
+
+    if failed_files:
+        print("\n📋 Список файлов с ошибками:")
+        for filename, error in failed_files:
+            print(f"   • {filename}: {error}")
+
+    print(f"📂 Результаты сохранены в: {output_dir}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # Both directories are optional positional arguments; running the
+    # script without arguments uses the same paths as before.
+    parser = argparse.ArgumentParser(description="Конвертер HTML → Markdown")
+    parser.add_argument(
+        "input_directory",
+        nargs="?",
+        default="./input_html",
+        help="директория с исходными HTML-файлами (по умолчанию: %(default)s)",
+    )
+    parser.add_argument(
+        "output_directory",
+        nargs="?",
+        default="./output_md",
+        help="директория для Markdown-файлов (по умолчанию: %(default)s)",
+    )
+    args = parser.parse_args()
+
+    input_directory = args.input_directory
+    output_directory = args.output_directory
+
+    print("🚀 Запуск конвертера HTML → Markdown")
+    print("=" * 50)
+
+    process_files(input_directory, output_directory)