# ai-email-assistant/src/ingest/loader.py
# Repository view details: 336 lines, 12 KiB, Python; timestamp 2025-07-18 19:22:57 +02:00

import os
import re
from typing import List, Dict, Any
from dataclasses import dataclass
import markdown
@dataclass
class Document:
    """A piece of loaded text bundled with its metadata and identifiers."""

    # Text body of the document.
    content: str
    # Free-form descriptive attributes, keyed by name.
    metadata: Dict[str, Any]
    # Identifier assigned to the document by whoever builds it.
    doc_id: str
    # Origin of the document as recorded by whoever builds it.
    source: str
class MarkdownLoader:
    """Load Markdown files into ``Document`` objects with heuristic metadata.

    Metadata (industry, relevant roles, numeric metrics) is derived from
    keyword and regex matching against the raw file text; the document body
    is a cleaned, plain-text version of the Markdown with site boilerplate
    removed.
    """

    # Keyword (lower-case substring) -> industry tag. Insertion order decides
    # the order of tags in the resulting metadata.
    _INDUSTRY_KEYWORDS = {
        "маркетинг": "marketing_agency",
        "агентство": "marketing_agency",
        "реклам": "marketing_agency",
        "блогер": "marketing_agency",
        "mediar": "marketing_agency",
        "büro": "marketing_agency",
        "логист": "logistics",
        "достав": "logistics",
        "склад": "logistics",
        "грузчик": "logistics",
        "разраб": "software",
        "програм": "software",
        "progkids": "software",
        "it": "software",
        "диджитал": "software",
        "строит": "construction",
        "недвиж": "construction",
        "этажи": "construction",
        "рознич": "retail",
        "торгов": "retail",
        "консалт": "consulting",
        "экобренд": "manufacturing",
        "wonder": "manufacturing",
        "производ": "manufacturing",
        "колл-центр": "call_center",
        "звонки": "call_center",
    }

    # Keyword (lower-case substring) -> role tag.
    # NOTE(review): every matching keyword contributes its tag, so
    # "операционный директор" yields both "ops" and (via "директор") "ceo".
    # Left as in the original; confirm whether that is intended.
    _ROLE_KEYWORDS = {
        "технический директор": "tech",
        "техн": "tech",
        "cto": "tech",
        "операционный директор": "ops",
        "директор": "ceo",
        "руководи": "ceo",
        "основатель": "ceo",
        "фин": "finance",
        "бухгалт": "finance",
        "cfo": "finance",
        "операц": "ops",
        "coo": "ops",
        "hr": "hr",
        "кадр": "hr",
        "маркет": "marketing",
        "продаж": "sales",
        "менеджер": "other",
    }

    # Short Latin acronyms must match as whole words: as plain substrings
    # they fire on unrelated text ("coo" in "cookie", "it" in "site",
    # "hr" in "chrome").
    _WHOLE_WORD_KEYWORDS = frozenset({"it", "cto", "cfo", "coo", "hr"})

    # (pattern, metadata key) pairs, applied in order; the first match in the
    # text wins for each pattern. Two-group patterns describe a "from X to Y"
    # improvement and are stored as "<key>_before" / "<key>_after".
    _METRIC_PATTERNS = tuple(
        (re.compile(pattern, re.IGNORECASE), key)
        for pattern, key in (
            (r"(\d+)\s*минут[ауы]?", "processing_minutes"),
            (r"(\d+)\s*час[ауов]?", "processing_hours"),
            (r"(\d+)\s*дн[ейяах]", "processing_days"),
            # "дн(?:ей|я)" / "час(?:ов|а|у)?" accept the full word endings
            # ("дней", "часов"); an optional "~" is allowed before the target.
            (
                r"\bс\s+(\d+)\s*дн(?:ей|я)\s+до\s+~?\s*(\d+)\s*минут",
                "improvement_days_to_minutes",
            ),
            (
                r"\bс\s+(\d+)\s*час(?:ов|а|у)?\s+до\s+~?\s*(\d+)\s*минут",
                "improvement_hours_to_minutes",
            ),
            (r"(\d+)\s*секунд", "processing_seconds"),
            (r"(\d+)%\s*снижени", "error_reduction_pct"),
            (r"снижение[^0-9]*(\d+)%", "error_reduction_pct"),
            (r"(\d+)%\s*документ", "document_collection_pct"),
            (r"(\d+)%\s*точност", "accuracy_pct"),
            (r"увеличи[лв]\w*\s+в\s+(\d+)\s*раз", "growth_multiplier"),
            (r"(\d+)\s*блогер", "bloggers_count"),
            (r"(\d+)\s*исполнител", "contractors_count"),
            (r"(\d+)\s*сотрудник", "employees_count"),
            # Matches both "более N" and "больше N".
            (r"бол(?:ее|ьше)\s+(\d+)", "more_than_count"),
            (r"свыше\s+(\d+)", "over_count"),
        )
    )

    # "<views> <likes> [Комментировать]() dd.mm.yy" statistics line.
    _COMMENT_STATS_RE = re.compile(
        r"\d+\s+\d+\s+\[Комментировать\]\(\)\s+\d{2}\.\d{2}\.\d{2}"
    )

    # Site boilerplate removed from the document body.
    _NOISE_PATTERNS = tuple(
        re.compile(pattern, re.MULTILINE | re.IGNORECASE)
        for pattern in (
            r"Автор и редактор журнала Консоль",
            r"Автор\s+\[.*?\]\(\)",
            r"Поделиться",
            r"Ваше мнение\?",
            r"Отлично\s+Хорошо\s+Нормально\s+Плохо\s+Ужасно",
            r"Сайт использует файлы cookie.*?Принять",
            r"\[Политика конфиденциальности\]\(\)",
            r"\[Пользовательское соглашение\]\(\)",
            r"hello@konsol\.pro",
            r"\+7 \(\d{3}\) \d{3}-\d{2}-\d{2}",
            r"125047.*?дом \d+",
            r"\[Разработка — SKDO\]\(\)",
            r"\[Подключиться к Консоли\]\(\)",
            r"\[Кейсы наших клиентов\]\(\)",
            r"\[Делимся экспертизой\]\(\)",
            r"^\s*\d+\s*$",
            r"^\s*\[\d+\]\(\)\s*\d{2}\.\d{2}\.\d{2}\s*$",
        )
    )

    # A "### [title]()" teaser and everything up to the next "###" or the end.
    # The bracket part is kept on one line so it cannot swallow body text.
    _RELATED_ARTICLES_RE = re.compile(
        r"###\s+\[[^\]\n]*\]\(\).*?(?=###|\Z)", re.DOTALL
    )

    _HEADING_RE = re.compile(r"^\s*#+\s*", re.MULTILINE)
    _BULLET_RE = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
    _BOLD_RE = re.compile(r"\*\*(.*?)\*\*")
    _ITALIC_RE = re.compile(r"\*(.*?)\*")
    _LINK_RE = re.compile(r"\[([^\]]+)\]\([^\)]*\)")
    _DIGITS_ONLY_RE = re.compile(r"\d+")

    def __init__(self):
        """Create the loader and its Markdown converter instance.

        ``self.md`` is kept for callers; the other methods shown in this
        class do not reference it.
        """
        self.md = markdown.Markdown(extensions=["meta", "toc"])

    def load_file(self, file_path: str) -> "Document":
        """Read one UTF-8 Markdown file and build a ``Document`` from it.

        Args:
            file_path: Path of the file to read.

        Returns:
            A ``Document`` whose content is the cleaned text and whose
            metadata combines the extracted metadata with the title,
            document type, source path and size.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        metadata = self._extract_metadata(content, file_path)
        metadata.update(
            {
                "title": self._extract_title(content),
                # The whole path is inspected, so files under a directory
                # whose name contains "case" are typed as cases as well.
                "doc_type": "case" if "case" in file_path.lower() else "info",
                "source": file_path,
                # Length in characters of the decoded text, not bytes on disk.
                "file_size": len(content),
            }
        )
        return Document(
            content=self._clean_content(content),
            metadata=metadata,
            doc_id=self._generate_doc_id(file_path),
            source=file_path,
        )

    def load_directory(self, dir_path: str) -> List["Document"]:
        """Recursively load every ``.md`` file below ``dir_path``.

        Directories and files are visited in sorted order so the resulting
        list is deterministic; the extension check is case-insensitive.
        Loading is best-effort: a file that fails to load is reported on
        stdout and skipped.

        Args:
            dir_path: Root directory to walk.

        Returns:
            The successfully loaded documents.
        """
        documents = []
        for root, dirs, files in os.walk(dir_path):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.lower().endswith(".md"):
                    continue
                file_path = os.path.join(root, file_name)
                try:
                    documents.append(self.load_file(file_path))
                except Exception as e:
                    # Deliberately broad: one bad file must not abort the run.
                    print(f"Error loading {file_path}: {e}")
        return documents

    def _extract_title(self, content: str) -> str:
        """Return the first usable level-1 heading, or ``"Untitled"``.

        Headings that are empty or start with ``[`` (link-only headings)
        are skipped.
        """
        for raw_line in content.strip().split("\n"):
            line = raw_line.strip()
            if not line.startswith("# "):
                continue
            title = line[2:].strip()
            if title and not title.startswith("["):
                return title
        return "Untitled"

    def _generate_doc_id(self, file_path: str) -> str:
        """Derive a document id from the file name.

        The extension is dropped, spaces and hyphens become underscores and
        the result is lower-cased.

        NOTE(review): only the base name is used, so same-named files in
        different directories produce the same id.
        """
        stem = os.path.splitext(os.path.basename(file_path))[0]
        return stem.replace(" ", "_").replace("-", "_").lower()

    @classmethod
    def _contains_keyword(cls, keyword: str, text: str) -> bool:
        """Tell whether ``keyword`` occurs in the (lower-cased) ``text``.

        Keywords listed in ``_WHOLE_WORD_KEYWORDS`` must be delimited by
        non-alphanumeric characters (underscore counts as a delimiter);
        every other keyword is a plain substring match.
        """
        if keyword in cls._WHOLE_WORD_KEYWORDS:
            pattern = r"(?<![^\W_])" + re.escape(keyword) + r"(?![^\W_])"
            return re.search(pattern, text) is not None
        return keyword in text

    def _extract_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        """Build heuristic metadata for a document.

        Args:
            content: Raw file text.
            file_path: Path of the file; its base name is also searched for
                industry keywords.

        Returns:
            A dict with ``industry`` and ``roles_relevant`` tag lists
            (``["other"]`` when nothing matched), ``metrics`` when any were
            found, ``language`` and identical ``created_at`` /
            ``updated_at`` ISO timestamps.
        """
        import datetime

        filename = os.path.basename(file_path).lower()
        content_lower = content.lower()

        industries = []
        for keyword, industry in self._INDUSTRY_KEYWORDS.items():
            if industry in industries:
                continue
            if self._contains_keyword(
                keyword, content_lower
            ) or self._contains_keyword(keyword, filename):
                industries.append(industry)

        roles = []
        for keyword, role in self._ROLE_KEYWORDS.items():
            if role not in roles and self._contains_keyword(keyword, content_lower):
                roles.append(role)

        metadata = {
            "industry": industries or ["other"],
            "roles_relevant": roles or ["other"],
        }

        metrics = self._extract_metrics(content)
        if metrics:
            metadata["metrics"] = metrics

        metadata["language"] = "ru"

        # One timestamp for both fields so they are equal on creation.
        timestamp = datetime.datetime.now().isoformat()
        metadata["created_at"] = timestamp
        metadata["updated_at"] = timestamp
        return metadata

    def _extract_metrics(self, content: str) -> Dict[str, Any]:
        """Extract integer metrics (durations, percentages, counts) from text.

        Each pattern in ``_METRIC_PATTERNS`` contributes the numbers of its
        first match. When two patterns share a key, the later one overrides
        the earlier one.

        Returns:
            Mapping of metric name to integer; empty when nothing matched.
        """
        metrics = {}
        for pattern, key in self._METRIC_PATTERNS:
            match = pattern.search(content)
            if match is None:
                continue
            if pattern.groups == 2:
                metrics[f"{key}_before"] = int(match.group(1))
                metrics[f"{key}_after"] = int(match.group(2))
            else:
                metrics[key] = int(match.group(1))
        return metrics

    def _clean_content(self, content: str) -> str:
        """Turn scraped Markdown into plain text without site boilerplate.

        Boilerplate is removed while the Markdown link syntax is still
        intact, because most noise patterns are anchored on ``[text]()``
        links and ``###`` headings. Only afterwards are headings, bullets,
        emphasis and links flattened. Blank lines and digit-only lines are
        dropped from the result.
        """
        text = self._COMMENT_STATS_RE.sub("", content)
        for pattern in self._NOISE_PATTERNS:
            text = pattern.sub("", text)
        text = self._RELATED_ARTICLES_RE.sub("", text)

        # Drop lines that consist only of an empty-target link, before link
        # flattening makes them indistinguishable from ordinary text.
        text = "\n".join(
            line
            for line in text.split("\n")
            if not (line.strip().startswith("[") and line.strip().endswith("]()"))
        )

        text = self._HEADING_RE.sub("", text)
        # Bullets go before emphasis so a leading "* " is not mistaken for
        # the opening marker of an italic span.
        text = self._BULLET_RE.sub("", text)
        text = self._BOLD_RE.sub(r"\1", text)
        text = self._ITALIC_RE.sub(r"\1", text)
        text = self._LINK_RE.sub(r"\1", text)

        kept_lines = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line or self._DIGITS_ONLY_RE.fullmatch(line):
                continue
            # Short lines are kept: the original length filter was disabled
            # by an always-true ``startswith("")`` check.
            kept_lines.append(line)
        return "\n".join(kept_lines).strip()
def create_platform_overview() -> Document:
    """Build the fixed, hand-written overview document of the platform.

    Returns:
        A ``Document`` with id ``"platform_overview"`` whose content is the
        overview text below (surrounding whitespace stripped) and whose
        metadata is a static dictionary defined in this function.
    """
    overview_text = """
Консоль.Про платформа автоматизации работы с самозанятыми, ИП и физлицами.
Основные возможности:
Подключение нового исполнителя за ~15 минут
Выплаты в течение минут вместо часов
Сбор 100% закрывающих документов
Снижение ошибок до 95%
Управление сотнями исполнителей одним сотрудником
API интеграции для автоматизации процессов
Автоматический сбор чеков и документов
Снижение времени онбординга с 2 дней до ~20 минут
Платформа решает ключевые задачи бизнеса:
Быстрое масштабирование команды исполнителей
Автоматизация документооборота и выплат
Снижение операционных затрат
Обеспечение налогового соответствия
Упрощение работы с подрядчиками
Внедрение платформы занимает около 1 дня.
"""

    # Numbers quoted in the overview text, in structured form.
    headline_metrics = {
        "onboarding_minutes": 15,
        "onboarding_days_before": 2,
        "onboarding_minutes_after": 20,
        "error_reduction_pct": 95,
        "document_collection_pct": 100,
        "implementation_days": 1,
    }

    # Creation and update timestamps are the same fixed value.
    fixed_timestamp = "2024-01-01T00:00:00"

    overview_metadata = {
        "title": "Платформа Консоль.Про - Обзор",
        "doc_type": "platform_overview",
        "source": "internal",
        "industry": ["generic"],
        "roles_relevant": ["tech", "finance", "ops", "ceo"],
        "metrics": headline_metrics,
        "language": "ru",
        "created_at": fixed_timestamp,
        "updated_at": fixed_timestamp,
    }

    return Document(
        content=overview_text.strip(),
        metadata=overview_metadata,
        doc_id="platform_overview",
        source="platform_overview.md",
    )