import datetime
import os
import re
from typing import List, Dict, Any
from dataclasses import dataclass

import markdown


@dataclass
class Document:
    """A single loaded text document together with its descriptive metadata.

    Field order is part of the interface: instances may be built positionally
    as ``Document(content, metadata, doc_id, source)``.
    """

    # Body text of the document.
    content: str
    # Free-form key/value metadata describing the document.
    metadata: Dict[str, Any]
    # Identifier of the document.
    doc_id: str
    # Where the document came from (for loaded files, the file path).
    source: str


class MarkdownLoader:
    """Load Markdown files into ``Document`` objects with heuristic metadata.

    For every file the loader extracts a title, derives a document id from
    the file name, tags the text with industries / roles / numeric metrics
    using keyword and regex heuristics, and produces a cleaned plain-text
    version of the body.
    """

    # Latin acronyms that are short enough to occur inside unrelated words
    # ("coo" in "cookie", "it" in "digital"). They are matched only as
    # standalone tokens; every other keyword is a stem matched as a substring.
    _ACRONYM_KEYWORDS = frozenset({"it", "cto", "cfo", "coo", "hr"})

    # keyword (lower-case) -> industry label; iteration order defines the
    # order of labels in the resulting list.
    _INDUSTRY_KEYWORDS: Dict[str, str] = {
        "маркетинг": "marketing_agency",
        "агентство": "marketing_agency",
        "реклам": "marketing_agency",
        "блогер": "marketing_agency",
        "mediar": "marketing_agency",
        "büro": "marketing_agency",
        "логист": "logistics",
        "достав": "logistics",
        "склад": "logistics",
        "грузчик": "logistics",
        "разраб": "software",
        "програм": "software",
        "progkids": "software",
        "it": "software",
        "диджитал": "software",
        "строит": "construction",
        "недвиж": "construction",
        "этажи": "construction",
        "рознич": "retail",
        "торгов": "retail",
        "консалт": "consulting",
        "экобренд": "manufacturing",
        "wonder": "manufacturing",
        "производ": "manufacturing",
        "колл-центр": "call_center",
        "звонки": "call_center",
    }

    # keyword (lower-case) -> role label. Multi-word phrases come before the
    # generic words they contain so that they can take precedence.
    _ROLE_KEYWORDS: Dict[str, str] = {
        "технический директор": "tech",
        "техн": "tech",
        "cto": "tech",
        "операционный директор": "ops",
        "директор": "ceo",
        "руководи": "ceo",
        "основатель": "ceo",
        "фин": "finance",
        "бухгалт": "finance",
        "cfo": "finance",
        "операц": "ops",
        "coo": "ops",
        "hr": "hr",
        "кадр": "hr",
        "маркет": "marketing",
        "продаж": "sales",
        "менеджер": "other",
    }

    # (regex, metric key). A regex with two groups describes a before/after
    # improvement and is stored as "<key>_before" / "<key>_after".
    _METRIC_PATTERNS = (
        # Durations.
        (r"(\d+)\s*минут[ауы]?", "processing_minutes"),
        (r"(\d+)\s*час[ауов]?", "processing_hours"),
        (r"(\d+)\s*дн[ейяах]", "processing_days"),
        (
            r"с\s+(\d+)\s*дн(?:ей|я)\s+до\s+(\d+)\s*минут",
            "improvement_days_to_minutes",
        ),
        (
            r"с\s+(\d+)\s*час(?:ов|а|у)?\s+до\s+(\d+)\s*минут",
            "improvement_hours_to_minutes",
        ),
        (r"(\d+)\s*секунд", "processing_seconds"),
        # Percentages and multipliers.
        (r"(\d+)%\s*снижени", "error_reduction_pct"),
        (r"снижение[^0-9]*(\d+)%", "error_reduction_pct"),
        (r"(\d+)%\s*документ", "document_collection_pct"),
        (r"(\d+)%\s*точност", "accuracy_pct"),
        (r"увеличи[лв]\w*\s+в\s+(\d+)\s*раз", "growth_multiplier"),
        # Volumes.
        (r"(\d+)\s*блогер", "bloggers_count"),
        (r"(\d+)\s*исполнител", "contractors_count"),
        (r"(\d+)\s*сотрудник", "employees_count"),
        (r"бол(?:ее|ьше)\s+(\d+)", "more_than_count"),
        (r"свыше\s+(\d+)", "over_count"),
    )

    # Site chrome removed from the raw Markdown (applied with
    # re.MULTILINE | re.IGNORECASE). Several patterns rely on "[text]()"
    # link markup, so they must run before links are flattened.
    _NOISE_PATTERNS = (
        r"Автор и редактор журнала Консоль",
        r"Автор\s+\[.*?\]\(\)",
        r"Поделиться",
        r"Ваше мнение\?",
        r"Отлично\s+Хорошо\s+Нормально\s+Плохо\s+Ужасно",
        r"Сайт использует файлы cookie.*?Принять",
        r"\[Политика конфиденциальности\]\(\)",
        r"\[Пользовательское соглашение\]\(\)",
        r"hello@konsol\.pro",
        r"\+7 \(\d{3}\) \d{3}-\d{2}-\d{2}",
        r"125047.*?дом \d+",
        r"\[Разработка - SKDO\]\(\)",
        r"\[Подключиться к Консоли\]\(\)",
        r"\[Кейсы наших клиентов\]\(\)",
        r"\[Делимся экспертизой\]\(\)",
        r"^\s*\d+\s*$",
        r"^\s*\[\d+\]\(\)\s*\d{2}\.\d{2}\.\d{2}\s*$",
    )

    # "views comments [Комментировать]() dd.mm.yy" counter line.
    _COMMENT_COUNTER_PATTERN = (
        r"\d+\s+\d+\s+\[Комментировать\]\(\)\s+\d{2}\.\d{2}\.\d{2}"
    )

    # A "### [title]()" heading and everything up to the next "###" or the
    # end of the text. The link text is limited to one line so that a heading
    # with a real URL cannot make the match run away across the document.
    _RELATED_ARTICLES_PATTERN = r"###\s+\[[^\]\n]*\]\(\).*?(?=###|\Z)"

    def __init__(self):
        # Not used by any method of this class; kept so existing callers that
        # access ``loader.md`` continue to work.
        self.md = markdown.Markdown(extensions=["meta", "toc"])

    def load_file(self, file_path: str) -> "Document":
        """Read one UTF-8 Markdown file and return it as a ``Document``.

        Metadata is computed from the raw text; the stored content is the
        cleaned text. ``file_size`` is the number of characters read, and
        ``doc_type`` is ``"case"`` when the lower-cased path contains "case",
        otherwise ``"info"``.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        metadata = self._extract_metadata(content, file_path)
        metadata.update(
            {
                "title": self._extract_title(content),
                "doc_type": "case" if "case" in file_path.lower() else "info",
                "source": file_path,
                "file_size": len(content),
            }
        )

        return Document(
            content=self._clean_content(content),
            metadata=metadata,
            doc_id=self._generate_doc_id(file_path),
            source=file_path,
        )

    def load_directory(self, dir_path: str) -> List["Document"]:
        """Recursively load every ``*.md`` file below ``dir_path``.

        Loading is best-effort: a file that fails to load is reported on
        stdout and skipped. Directories and files are visited in sorted order
        so the result order does not depend on the file system.
        """
        documents = []

        for root, dirs, files in os.walk(dir_path):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.endswith(".md"):
                    continue
                file_path = os.path.join(root, file_name)
                try:
                    documents.append(self.load_file(file_path))
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")

        return documents

    def _extract_title(self, content: str) -> str:
        """Return the text of the first usable level-1 heading.

        A heading is usable when its text is non-empty and does not start
        with "[" (link-only headings are skipped). Returns "Untitled" when no
        such heading exists.
        """
        for line in content.strip().split("\n"):
            line = line.strip()
            if not line.startswith("# "):
                continue
            title = line[2:].strip()
            if title and not title.startswith("["):
                return title

        return "Untitled"

    def _generate_doc_id(self, file_path: str) -> str:
        """Build a document id from the file name.

        The extension is dropped, spaces and hyphens become underscores and
        the result is lower-cased. Only the base name is used, so files with
        the same name in different directories get the same id.
        """
        filename = os.path.basename(file_path)
        name_without_ext = os.path.splitext(filename)[0]
        return name_without_ext.replace(" ", "_").replace("-", "_").lower()

    @classmethod
    def _keyword_present(cls, keyword: str, text: str) -> bool:
        """Tell whether ``keyword`` occurs in the lower-cased ``text``.

        Acronyms must stand alone (no letter or digit on either side; an
        underscore or hyphen counts as a separator); all other keywords are
        plain substring matches.
        """
        if keyword in cls._ACRONYM_KEYWORDS:
            pattern = rf"(?<![^\W_]){re.escape(keyword)}(?![^\W_])"
            return re.search(pattern, text) is not None
        return keyword in text

    @classmethod
    def _collect_labels(cls, mapping: Dict[str, str], text: str) -> List[str]:
        """Return the distinct labels whose keywords occur in ``text``.

        Labels are listed in the order their first keyword appears in
        ``mapping``. Once a multi-word phrase has matched, its occurrences are
        blanked out so the generic words inside it do not add another label.
        Returns ``["other"]`` when nothing matches.
        """
        labels: List[str] = []
        remaining = text
        for keyword, label in mapping.items():
            if not cls._keyword_present(keyword, remaining):
                continue
            if label not in labels:
                labels.append(label)
            if " " in keyword:
                remaining = remaining.replace(keyword, " ")
        return labels or ["other"]

    def _extract_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        """Derive heuristic metadata from the raw text and the file name.

        Returns a dict with ``industry`` and ``roles_relevant`` (lists of
        labels), ``metrics`` (only when at least one metric was found),
        ``language`` (always "ru") and ``created_at`` / ``updated_at`` (the
        same local-time ISO timestamp taken at extraction time).
        """
        filename = os.path.basename(file_path).lower()
        content_lower = content.lower()

        metadata: Dict[str, Any] = {
            # Industry keywords are looked up in the text and the file name;
            # the newline keeps a keyword from matching across the two.
            "industry": self._collect_labels(
                self._INDUSTRY_KEYWORDS, f"{content_lower}\n{filename}"
            ),
            "roles_relevant": self._collect_labels(
                self._ROLE_KEYWORDS, content_lower
            ),
        }

        metrics = self._extract_metrics(content)
        if metrics:
            metadata["metrics"] = metrics

        metadata["language"] = "ru"

        # One timestamp for both fields so they are equal on creation.
        now = datetime.datetime.now().isoformat()
        metadata["created_at"] = now
        metadata["updated_at"] = now

        return metadata

    def _extract_metrics(self, content: str) -> Dict[str, Any]:
        """Extract integer metrics from the text with regex heuristics.

        For each pattern only the first occurrence in the text is used.
        Single-value patterns are stored under their key; before/after
        patterns are stored as ``<key>_before`` and ``<key>_after``. When two
        patterns share a key, the later pattern wins if it matches.
        """
        metrics: Dict[str, Any] = {}

        for pattern, key in self._METRIC_PATTERNS:
            match = re.search(pattern, content, re.IGNORECASE)
            if match is None:
                continue
            # Every group is (\d+), so int() cannot fail here.
            values = [int(group) for group in match.groups()]
            if len(values) == 2:
                metrics[f"{key}_before"], metrics[f"{key}_after"] = values
            else:
                metrics[key] = values[0]

        return metrics

    def _clean_content(self, content: str) -> str:
        """Strip site chrome and Markdown markup, returning plain text.

        The work is done in three stages whose order matters:

        1. Remove noise while the raw markup is intact: related-article
           sections, the comment-counter line, known site chrome and lines
           that consist only of empty-href links (navigation).
        2. Strip Markdown: heading markers, bullets (turned into "• "),
           bold/italic markers and links (replaced by their text).
        3. Trim every line and drop blank lines, digit-only lines and lines
           of 10 characters or fewer that are not bullets.
        """
        # Stage 1: noise removal on raw markup.
        content = re.sub(
            self._RELATED_ARTICLES_PATTERN, "", content, flags=re.DOTALL
        )
        content = re.sub(self._COMMENT_COUNTER_PATTERN, "", content)
        for pattern in self._NOISE_PATTERNS:
            content = re.sub(
                pattern, "", content, flags=re.MULTILINE | re.IGNORECASE
            )
        content = "\n".join(
            line
            for line in content.split("\n")
            if not (line.strip().startswith("[") and line.strip().endswith("]()"))
        )

        # Stage 2: Markdown stripping. Bullets go before emphasis so that a
        # leading "* " is not mistaken for an opening italic marker.
        content = re.sub(r"^\s*#+\s*", "", content, flags=re.MULTILINE)
        content = re.sub(r"^\s*[-*+]\s+", "• ", content, flags=re.MULTILINE)
        content = re.sub(r"\*\*(.*?)\*\*", r"\1", content)
        content = re.sub(r"\*(.*?)\*", r"\1", content)
        content = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", content)

        # Stage 3: line-level filtering.
        kept_lines = []
        for raw_line in content.split("\n"):
            line = raw_line.strip()
            if not line or re.fullmatch(r"\d+", line):
                continue
            if len(line) > 10 or line.startswith("•"):
                kept_lines.append(line)

        return "\n".join(kept_lines).strip()


def create_platform_overview() -> Document:
    """Build the hard-coded overview document for the Консоль.Про platform.

    The text and all metadata values (including the fixed
    ``created_at`` / ``updated_at`` timestamps) are literals defined here;
    nothing is read from disk. The returned document has the id
    ``"platform_overview"`` and the source ``"platform_overview.md"``.
    """
    # Leading/trailing blank lines of the literal are removed with strip()
    # when the Document is built below.
    content = """
Консоль.Про – платформа автоматизации работы с самозанятыми, ИП и физлицами.

Основные возможности:
• Подключение нового исполнителя за ~15 минут
• Выплаты в течение минут вместо часов
• Сбор 100% закрывающих документов
• Снижение ошибок до 95%
• Управление сотнями исполнителей одним сотрудником
• API интеграции для автоматизации процессов
• Автоматический сбор чеков и документов
• Снижение времени онбординга с 2 дней до ~20 минут

Платформа решает ключевые задачи бизнеса:
• Быстрое масштабирование команды исполнителей
• Автоматизация документооборота и выплат
• Снижение операционных затрат
• Обеспечение налогового соответствия
• Упрощение работы с подрядчиками

Внедрение платформы занимает около 1 дня.
"""

    metadata = {
        "title": "Платформа Консоль.Про - Обзор",
        "doc_type": "platform_overview",
        "source": "internal",
        "industry": ["generic"],
        "roles_relevant": ["tech", "finance", "ops", "ceo"],
        # Figures quoted in the overview text above.
        "metrics": {
            "onboarding_minutes": 15,
            "onboarding_days_before": 2,
            "onboarding_minutes_after": 20,
            "error_reduction_pct": 95,
            "document_collection_pct": 100,
            "implementation_days": 1,
        },
        "language": "ru",
        "created_at": "2024-01-01T00:00:00",
        "updated_at": "2024-01-01T00:00:00",
    }

    return Document(
        content=content.strip(),
        metadata=metadata,
        doc_id="platform_overview",
        source="platform_overview.md",
    )