# ai-email-assistant/src/ingest/loader.py

import os
import re
from typing import List, Dict, Any
from dataclasses import dataclass
import markdown


@dataclass
class Document:
    """A unit of ingested text together with its descriptive metadata."""

    # Document text (MarkdownLoader.load_file stores the cleaned file text here).
    content: str
    # Free-form metadata; the loaders in this file fill keys such as "title",
    # "doc_type", "industry", "roles_relevant", "metrics" and "language".
    metadata: Dict[str, Any]
    # Identifier of the document (derived from the file name by MarkdownLoader).
    doc_id: str
    # Where the document came from (a file path for documents loaded from disk).
    source: str


class MarkdownLoader:
    """Turn Markdown files into ``Document`` objects with heuristic metadata.

    Industry tags, relevant roles and numeric metrics are derived by keyword
    and regex matching against (mostly Russian) text. The cleaning step strips
    Markdown markup and page boilerplate such as share widgets, cookie banners
    and footer links.
    """

    # Lower-cased keyword (usually a word stem) -> industry tag. Insertion
    # order defines the order of tags in the resulting metadata.
    _INDUSTRY_KEYWORDS = {
        "маркетинг": "marketing_agency",
        "агентство": "marketing_agency",
        "реклам": "marketing_agency",
        "блогер": "marketing_agency",
        "mediar": "marketing_agency",
        "büro": "marketing_agency",
        "логист": "logistics",
        "достав": "logistics",
        "склад": "logistics",
        "грузчик": "logistics",
        "разраб": "software",
        "програм": "software",
        "progkids": "software",
        "it": "software",
        "диджитал": "software",
        "строит": "construction",
        "недвиж": "construction",
        "этажи": "construction",
        "рознич": "retail",
        "торгов": "retail",
        "консалт": "consulting",
        "экобренд": "manufacturing",
        "wonder": "manufacturing",
        "производ": "manufacturing",
        "колл-центр": "call_center",
        "звонки": "call_center",
    }

    # Lower-cased keyword -> role tag. Every matching keyword contributes its
    # tag, so e.g. "технический директор" yields both "tech" and "ceo".
    _ROLE_KEYWORDS = {
        "технический директор": "tech",
        "техн": "tech",
        "cto": "tech",
        "операционный директор": "ops",
        "директор": "ceo",
        "руководи": "ceo",
        "основатель": "ceo",
        "фин": "finance",
        "бухгалт": "finance",
        "cfo": "finance",
        "операц": "ops",
        "coo": "ops",
        "hr": "hr",
        "кадр": "hr",
        "маркет": "marketing",
        "продаж": "sales",
        "менеджер": "other",
    }

    # (regex, metric key) pairs, applied case-insensitively; only the first
    # match of each regex is used. A regex with two groups is a
    # "before -> after" improvement and produces "<key>_before"/"<key>_after".
    _METRIC_PATTERNS = (
        (r"(\d+)\s*минут[ауы]?", "processing_minutes"),
        (r"(\d+)\s*час[ауов]?", "processing_hours"),
        (r"(\d+)\s*дн[ейяах]", "processing_days"),
        # Suffixes are alternations, not character classes, so the full
        # genitive forms "дней" / "часов" are accepted before the whitespace.
        (
            r"с\s+(\d+)\s*дн(?:ей|я)\s+до\s+(\d+)\s*минут",
            "improvement_days_to_minutes",
        ),
        (
            r"с\s+(\d+)\s*час(?:ов|а|у)?\s+до\s+(\d+)\s*минут",
            "improvement_hours_to_minutes",
        ),
        (r"(\d+)\s*секунд", "processing_seconds"),
        (r"(\d+)%\s*снижени", "error_reduction_pct"),
        # Same key as above on purpose: this phrasing wins when both match.
        (r"снижение[^0-9]*(\d+)%", "error_reduction_pct"),
        (r"(\d+)%\s*документ", "document_collection_pct"),
        (r"(\d+)%\s*точност", "accuracy_pct"),
        (r"увеличи[лв]\w*\s+в\s+(\d+)\s*раз", "growth_multiplier"),
        (r"(\d+)\s*блогер", "bloggers_count"),
        (r"(\d+)\s*исполнител", "contractors_count"),
        (r"(\d+)\s*сотрудник", "employees_count"),
        (r"бол[ьеее]+\s+(\d+)", "more_than_count"),
        (r"свыше\s+(\d+)", "over_count"),
    )

    # A "### [title]()" heading and everything up to the next "###" heading
    # (or the end of the text). The bracketed title must sit on one line so
    # the match cannot start at an ordinary heading and run to a later "]()".
    _RELATED_ARTICLES_PATTERN = r"###\s+\[[^\n]*?\]\(\).*?(?=###|\Z)"

    # "<n> <n> [Комментировать]() dd.mm.yy" counter line.
    _COMMENT_COUNTER_PATTERN = (
        r"\d+\s+\d+\s+\[Комментировать\]\(\)\s+\d{2}\.\d{2}\.\d{2}"
    )

    # Boilerplate that is only recognisable while "[text]()" link syntax is
    # still present, i.e. these must run BEFORE links are unwrapped.
    _LINK_NOISE_PATTERNS = (
        r"Автор\s+\[.*?\]\(\)",
        r"\[Политика конфиденциальности\]\(\)",
        r"\[Пользовательское соглашение\]\(\)",
        r"\[Разработка - SKDO\]\(\)",
        r"\[Подключиться к Консоли\]\(\)",
        r"\[Кейсы наших клиентов\]\(\)",
        r"\[Делимся экспертизой\]\(\)",
        r"^\s*\[\d+\]\(\)\s*\d{2}\.\d{2}\.\d{2}\s*$",
    )

    # Plain-text boilerplate, removed AFTER markup stripping so that it is
    # also caught when it was wrapped in emphasis or link syntax.
    _TEXT_NOISE_PATTERNS = (
        r"Автор и редактор журнала Консоль",
        r"Поделиться",
        r"Ваше мнение\?",
        r"Отлично\s+Хорошо\s+Нормально\s+Плохо\s+Ужасно",
        r"Сайт использует файлы cookie.*?Принять",
        r"hello@konsol\.pro",
        r"\+7 \(\d{3}\) \d{3}-\d{2}-\d{2}",
        r"125047.*?дом \d+",
        r"^\s*\d+\s*$",
    )

    # A line consisting solely of one or more "[text]()" placeholder links.
    _PLACEHOLDER_LINE_PATTERN = r"(?:\[[^\]]*\]\(\)\s*)+"

    def __init__(self):
        self.md = markdown.Markdown(extensions=["meta", "toc"])

    def load_file(self, file_path: str) -> "Document":
        """Read one UTF-8 Markdown file and build a ``Document`` from it.

        Args:
            file_path: Path of the file to read.

        Returns:
            A ``Document`` whose content is the cleaned text and whose metadata
            combines the keyword-derived fields with ``title``, ``doc_type``
            ("case" when the lower-cased path contains "case", else "info"),
            ``source`` and ``file_size`` (length of the raw text in characters).

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()

        metadata = self._extract_metadata(raw_text, file_path)
        metadata.update(
            {
                "title": self._extract_title(raw_text),
                "doc_type": "case" if "case" in file_path.lower() else "info",
                "source": file_path,
                "file_size": len(raw_text),
            }
        )

        return Document(
            content=self._clean_content(raw_text),
            metadata=metadata,
            doc_id=self._generate_doc_id(file_path),
            source=file_path,
        )

    def load_directory(self, dir_path: str) -> List["Document"]:
        """Recursively load every ``*.md`` file below ``dir_path``.

        Files that fail to load are reported on stdout and skipped, so one
        broken file does not abort the whole ingestion run.

        Args:
            dir_path: Root directory to walk.

        Returns:
            The loaded documents, in sorted directory/file order.
        """
        documents = []

        for root, dirs, files in os.walk(dir_path):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.endswith(".md"):
                    continue
                file_path = os.path.join(root, file_name)
                try:
                    documents.append(self.load_file(file_path))
                except Exception as e:
                    # Deliberate best-effort boundary: report and carry on.
                    print(f"Error loading {file_path}: {e}")

        return documents

    def _extract_title(self, content: str) -> str:
        """Return the first usable level-1 heading, or ``"Untitled"``.

        A heading is skipped when it is empty or starts with ``[`` (i.e. it is
        a link rather than a real title).
        """
        for raw_line in content.strip().split("\n"):
            line = raw_line.strip()
            if not line.startswith("# "):
                continue
            title = line[2:].strip()
            if title and not title.startswith("["):
                return title

        return "Untitled"

    def _generate_doc_id(self, file_path: str) -> str:
        """Derive a document id from the file name.

        The extension is dropped, spaces and hyphens become underscores and the
        result is lower-cased. Only the base name is used, so files with the
        same name in different directories get the same id.
        """
        stem, _ext = os.path.splitext(os.path.basename(file_path))
        return stem.replace(" ", "_").replace("-", "_").lower()

    @staticmethod
    def _keyword_present(keyword: str, text: str) -> bool:
        """Tell whether ``keyword`` occurs in the already lower-cased ``text``.

        Short ASCII abbreviations ("it", "hr", "cto", ...) must appear as a
        separate token: plain substring matching would find "coo" in "cookie"
        or "it" in "digital". Longer keywords are stems and match as substrings.
        """
        if keyword.isascii() and len(keyword) <= 3:
            token = rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])"
            return re.search(token, text) is not None
        return keyword in text

    def _collect_tags(self, mapping: Dict[str, str], *texts: str) -> List[str]:
        """Return the distinct tags whose keyword occurs in any of ``texts``.

        Tags keep the order of their first matching keyword in ``mapping``;
        ``["other"]`` is returned when nothing matches.
        """
        tags: List[str] = []
        for keyword, tag in mapping.items():
            if tag in tags:
                continue
            if any(self._keyword_present(keyword, text) for text in texts):
                tags.append(tag)
        return tags or ["other"]

    def _extract_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        """Derive metadata from the raw text and the file name.

        Args:
            content: Raw (uncleaned) file text.
            file_path: Path of the file; only its base name is inspected.

        Returns:
            A dict with ``industry`` and ``roles_relevant`` (lists of tags,
            ``["other"]`` when nothing matched), ``metrics`` (only when at
            least one metric was found), ``language`` (always ``"ru"``) and
            identical ``created_at`` / ``updated_at`` ISO timestamps.
        """
        import datetime

        filename = os.path.basename(file_path).lower()
        content_lower = content.lower()

        metadata: Dict[str, Any] = {
            # Industry keywords are looked up in the text and the file name,
            # role keywords in the text only.
            "industry": self._collect_tags(
                self._INDUSTRY_KEYWORDS, content_lower, filename
            ),
            "roles_relevant": self._collect_tags(self._ROLE_KEYWORDS, content_lower),
        }

        metrics = self._extract_metrics(content)
        if metrics:
            metadata["metrics"] = metrics

        metadata["language"] = "ru"

        # One clock reading, so a fresh document has created_at == updated_at.
        timestamp = datetime.datetime.now().isoformat()
        metadata["created_at"] = timestamp
        metadata["updated_at"] = timestamp

        return metadata

    def _extract_metrics(self, content: str) -> Dict[str, Any]:
        """Pull numeric metrics (durations, percentages, counts) out of text.

        Returns:
            A dict mapping metric keys to ints. Single-value patterns store
            their number under the key itself; "from X to Y" patterns store
            ``<key>_before`` and ``<key>_after``. Empty when nothing matched.
        """
        metrics: Dict[str, Any] = {}

        for pattern, key in self._METRIC_PATTERNS:
            match = re.search(pattern, content, re.IGNORECASE)
            if match is None:
                continue
            try:
                values = [int(group) for group in match.groups()]
            except ValueError:
                # Only reachable for absurdly long digit runs; skip the metric.
                continue
            if len(values) == 2:
                metrics[f"{key}_before"], metrics[f"{key}_after"] = values
            else:
                metrics[key] = values[0]

        return metrics

    def _clean_content(self, content: str) -> str:
        """Strip Markdown markup and page boilerplate from ``content``.

        The order of the passes matters: everything that recognises raw
        Markdown (``###`` headings, ``[text]()`` placeholder links) runs before
        the pass that removes that syntax, and bullets are converted before
        emphasis is stripped so a leading ``*`` is not mistaken for italics.

        Returns:
            The remaining non-empty lines joined by ``"\\n"``. Lines of at most
            10 characters are dropped unless they are bullet items.
        """
        noise_flags = re.MULTILINE | re.IGNORECASE

        # 1. Related-article teasers need the raw "###" markers.
        content = re.sub(
            self._RELATED_ARTICLES_PATTERN, "", content, flags=re.DOTALL
        )

        # 2. Heading markers and bullets (neither touches link syntax).
        content = re.sub(r"^\s*#+\s*", "", content, flags=re.MULTILINE)
        content = re.sub(r"^\s*[-*+]\s+", "• ", content, flags=re.MULTILINE)

        # 3. Boilerplate identified by its "[text]()" placeholder links.
        content = re.sub(self._COMMENT_COUNTER_PATTERN, "", content)
        for pattern in self._LINK_NOISE_PATTERNS:
            content = re.sub(pattern, "", content, flags=noise_flags)
        content = "\n".join(
            line
            for line in content.split("\n")
            if not re.fullmatch(self._PLACEHOLDER_LINE_PATTERN, line.strip())
        )

        # 4. Emphasis, then remaining links become plain text.
        content = re.sub(r"\*\*(.*?)\*\*", r"\1", content)
        content = re.sub(r"\*(.*?)\*", r"\1", content)
        content = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", content)

        # 5. Plain-text boilerplate.
        for pattern in self._TEXT_NOISE_PATTERNS:
            content = re.sub(pattern, "", content, flags=noise_flags)

        # 6. Keep only lines that still carry content.
        kept_lines = []
        for raw_line in content.split("\n"):
            line = raw_line.strip()
            if not line or line == "•":
                # Blank line, or a bullet whose text was removed as noise.
                continue
            if line.startswith("[") and line.endswith("]()"):
                continue
            if re.match(r"^\d+\s*$", line):
                continue
            if len(line) > 10 or line.startswith("•"):
                kept_lines.append(line)

        return "\n".join(kept_lines).strip()


def create_platform_overview() -> Document:
    """Build the hard-coded overview document describing the platform.

    Returns:
        A ``Document`` with id ``"platform_overview"``, a fixed Russian-language
        summary as content and hand-written metadata (generic industry, the
        tech/finance/ops/ceo roles and a fixed set of headline metrics).
    """
    overview_lines = (
        "Консоль.Про – платформа автоматизации работы с самозанятыми, ИП и физлицами. ",
        "",
        "Основные возможности:",
        "• Подключение нового исполнителя за ~15 минут",
        "• Выплаты в течение минут вместо часов",
        "• Сбор 100% закрывающих документов",
        "• Снижение ошибок до 95%",
        "• Управление сотнями исполнителей одним сотрудником",
        "• API интеграции для автоматизации процессов",
        "• Автоматический сбор чеков и документов",
        "• Снижение времени онбординга с 2 дней до ~20 минут",
        "",
        "Платформа решает ключевые задачи бизнеса:",
        "• Быстрое масштабирование команды исполнителей",
        "• Автоматизация документооборота и выплат",
        "• Снижение операционных затрат",
        "• Обеспечение налогового соответствия",
        "• Упрощение работы с подрядчиками",
        "",
        "Внедрение платформы занимает около 1 дня.",
    )
    overview_text = "\n".join(overview_lines)

    # Headline figures quoted in the overview text above.
    headline_metrics = {
        "onboarding_minutes": 15,
        "onboarding_days_before": 2,
        "onboarding_minutes_after": 20,
        "error_reduction_pct": 95,
        "document_collection_pct": 100,
        "implementation_days": 1,
    }

    # Fixed timestamp: this document is static, not generated from a file.
    fixed_timestamp = "2024-01-01T00:00:00"

    overview_metadata = {
        "title": "Платформа Консоль.Про - Обзор",
        "doc_type": "platform_overview",
        "source": "internal",
        "industry": ["generic"],
        "roles_relevant": ["tech", "finance", "ops", "ceo"],
        "metrics": headline_metrics,
        "language": "ru",
        "created_at": fixed_timestamp,
        "updated_at": fixed_timestamp,
    }

    return Document(
        content=overview_text,
        metadata=overview_metadata,
        doc_id="platform_overview",
        source="platform_overview.md",
    )