"""Read Wikipedia article URLs from a text file as ``SimplifyCommand`` objects."""

from __future__ import annotations

import asyncio
from pathlib import Path
from typing import AsyncGenerator
from urllib.parse import urlparse

import structlog

from src.models import SimplifyCommand
from src.models.constants import ARTICLE_NAME_INDEX, MIN_WIKI_PATH_PARTS, WIKI_PATH_INDEX


class FileSource:
    """Source of Wikipedia article URLs backed by a UTF-8 text file.

    The file is read line by line. Blank lines and lines starting with ``#``
    are ignored; every remaining line is expected to be a single URL.
    """

    def __init__(self, file_path: str) -> None:
        """Remember the file location and bind a logger to it.

        Args:
            file_path: Path to the text file containing one URL per line.
        """
        self.file_path = Path(file_path)
        self.logger = structlog.get_logger().bind(source="file", path=str(self.file_path))

    async def read_urls(
        self, *, force_reprocess: bool = False
    ) -> AsyncGenerator[SimplifyCommand, None]:
        """Yield one ``SimplifyCommand`` per unique, valid URL in the file.

        Invalid URLs are logged as warnings and skipped. A URL that has already
        been yielded is logged at debug level and skipped.

        Because this is an async generator, nothing (including the existence
        check) runs until iteration starts.

        Args:
            force_reprocess: Passed through unchanged to every yielded command.

        Yields:
            A ``SimplifyCommand`` for each accepted URL, in file order.

        Raises:
            FileNotFoundError: If the file does not exist when iteration starts.
        """
        if not self.file_path.exists():
            msg = f"Файл с URL не найден: {self.file_path}"
            raise FileNotFoundError(msg)

        self.logger.info("Начинаем чтение URL из файла")

        # The blocking read is pushed to a worker thread so the event loop
        # stays responsive.
        content = await asyncio.to_thread(self._read_file_sync)

        seen_urls: set[str] = set()
        valid_count = 0
        invalid_count = 0

        for line_num, original_line in enumerate(content.splitlines(), 1):
            line = original_line.strip()

            # Skip blank lines and comment lines.
            if not line or line.startswith("#"):
                continue

            if not self._is_valid_wikipedia_url(line):
                self.logger.warning("Невалидный URL", line_number=line_num, url=line)
                invalid_count += 1
                continue

            if line in seen_urls:
                self.logger.debug("Дубликат URL пропущен", line_number=line_num, url=line)
                continue

            seen_urls.add(line)
            valid_count += 1
            yield SimplifyCommand(url=line, force_reprocess=force_reprocess)

        self.logger.info(
            "Завершено чтение URL",
            valid_count=valid_count,
            invalid_count=invalid_count,
            total_unique=len(seen_urls),
        )

    def _read_file_sync(self) -> str:
        """Return the whole file content decoded as UTF-8 (blocking)."""
        return self.file_path.read_text(encoding="utf-8")

    def _is_valid_wikipedia_url(self, url: str) -> bool:
        """Return True if *url* points at a Wikipedia article page.

        A URL is accepted when all of the following hold:

        * the scheme is ``http`` or ``https``;
        * the host is ``wikipedia.org`` or one of its subdomains;
        * the path has at least ``MIN_WIKI_PATH_PARTS`` ``/``-separated parts
          and the part at ``WIKI_PATH_INDEX`` is ``wiki``;
        * the part at ``ARTICLE_NAME_INDEX`` is non-empty and is not one of the
          main-page names.

        Args:
            url: Candidate URL string.

        Returns:
            True for an acceptable article URL, False otherwise (including
            URLs that cannot be parsed).
        """
        try:
            parsed = urlparse(url)

            if parsed.scheme not in ("http", "https"):
                return False

            # ``hostname`` is lower-cased and excludes userinfo and port, so
            # look-alikes such as "wikipedia.org.evil.com", "notwikipedia.org"
            # or "wikipedia.org@evil.com" are not mistaken for Wikipedia.
            # A trailing dot (fully-qualified form) is tolerated.
            host = (parsed.hostname or "").rstrip(".")
            if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
                return False

            path_parts = parsed.path.split("/")
            if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki":
                return False

            article_name = path_parts[ARTICLE_NAME_INDEX]
            return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница"))
        except (ValueError, IndexError):
            # ValueError: urlparse rejected a malformed URL.
            # IndexError: the path is shorter than the configured indexes.
            return False

    async def count_urls(self) -> int:
        """Return how many commands ``read_urls`` yields for the file.

        This performs a full pass through ``read_urls`` with default arguments,
        so the file is read again and the same log messages are emitted.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        count = 0
        async for _ in self.read_urls():
            count += 1
        return count