ruwiki-test/src/sources.py

89 lines
2.8 KiB
Python
Raw Normal View History

2025-07-11 21:28:58 +02:00
from __future__ import annotations
import asyncio
from pathlib import Path
from typing import AsyncGenerator
from urllib.parse import urlparse
import structlog
from src.models import SimplifyCommand
from src.models.constants import ARTICLE_NAME_INDEX, MIN_WIKI_PATH_PARTS, WIKI_PATH_INDEX
class FileSource:
    """Reads Wikipedia article URLs from a text file, one URL per line.

    Blank lines and lines starting with ``#`` are skipped; duplicate URLs
    are yielded only once per read.
    """

    def __init__(self, file_path: str) -> None:
        self.file_path = Path(file_path)
        self.logger = structlog.get_logger().bind(source="file", path=str(self.file_path))

    async def read_urls(
        self, *, force_reprocess: bool = False
    ) -> AsyncGenerator[SimplifyCommand, None]:
        """Yield a SimplifyCommand for each valid, unique Wikipedia URL in the file.

        Args:
            force_reprocess: propagated unchanged into every emitted command.

        Raises:
            FileNotFoundError: if the configured file does not exist.
        """
        if not self.file_path.exists():
            msg = f"Файл с URL не найден: {self.file_path}"
            raise FileNotFoundError(msg)

        self.logger.info("Начинаем чтение URL из файла")
        # File reading is blocking; run it off the event loop.
        content = await asyncio.to_thread(self._read_file_sync)

        seen_urls: set[str] = set()
        valid_count = 0
        invalid_count = 0

        for line_num, original_line in enumerate(content.splitlines(), 1):
            line = original_line.strip()
            if not line or line.startswith("#"):
                continue  # blank line or comment
            if not self._is_valid_wikipedia_url(line):
                self.logger.warning("Невалидный URL", line_number=line_num, url=line)
                invalid_count += 1
                continue
            if line in seen_urls:
                self.logger.debug("Дубликат URL пропущен", line_number=line_num, url=line)
                continue
            seen_urls.add(line)
            valid_count += 1
            yield SimplifyCommand(url=line, force_reprocess=force_reprocess)

        self.logger.info(
            "Завершено чтение URL",
            valid_count=valid_count,
            invalid_count=invalid_count,
            total_unique=len(seen_urls),
        )

    def _read_file_sync(self) -> str:
        """Blocking whole-file read; invoked via asyncio.to_thread."""
        return self.file_path.read_text(encoding="utf-8")

    def _is_valid_wikipedia_url(self, url: str) -> bool:
        """Return True if *url* is an http(s) link to a Wikipedia article page.

        The host must be ``wikipedia.org`` or one of its subdomains, the path
        must look like ``/wiki/<article>``, and the main pages (English and
        Russian) are rejected.
        """
        try:
            parsed = urlparse(url)
            if parsed.scheme not in ("http", "https"):
                return False
            # Exact-domain check. The previous substring test
            # ('"wikipedia.org" in parsed.netloc') also accepted look-alike
            # hosts such as "notwikipedia.org" or "wikipedia.org.evil.com".
            # parsed.hostname is lower-cased and has any port stripped.
            host = parsed.hostname or ""
            if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
                return False
            path_parts = parsed.path.split("/")
            if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki":
                return False
            article_name = path_parts[ARTICLE_NAME_INDEX]
            # Reject empty article names and the main pages.
            return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница"))
        except ValueError:
            # urlparse / .hostname raise ValueError on malformed netloc/port.
            return False

    async def count_urls(self) -> int:
        """Count valid unique URLs by consuming read_urls() (re-reads the file)."""
        count = 0
        async for _ in self.read_urls():
            count += 1
        return count