dataloader/src/dataloader/storage/repositories/opu.py

149 lines
4.7 KiB
Python
Raw Normal View History

2025-11-05 18:32:27 +01:00
"""Репозиторий для работы с данными OPU."""
from __future__ import annotations
from collections.abc import Sequence
from typing import Any
2025-11-06 19:54:40 +01:00
from sqlalchemy import DDL, text
2025-11-05 18:32:27 +01:00
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import AsyncSession
2025-11-06 19:54:40 +01:00
from dataloader.config import APP_CONFIG
2025-11-05 18:32:27 +01:00
from dataloader.storage.models import BriefDigitalCertificateOpu
class OpuRepository:
    """Repository for the ``brief_digital_certificate_opu`` table."""

    # Load-audit columns that must never be overwritten on conflict.
    _IMMUTABLE_COLUMNS = frozenset({"wf_load_id", "wf_load_dttm", "wf_row_id"})

    def __init__(self, session: AsyncSession):
        """
        Initialize the repository.

        Args:
            session: Async SQLAlchemy session used for all statements.
        """
        self.s = session
        self.schema = APP_CONFIG.pg.schema_opu
        self.batch_size = APP_CONFIG.pg.batch_size

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return *name* as a safely double-quoted PostgreSQL identifier."""
        # Escape first: backslash-escaped quotes inside an f-string
        # replacement field are a SyntaxError before Python 3.12 (PEP 701).
        escaped = name.replace('"', '""')
        return f'"{escaped}"'

    async def truncate(
        self, *, cascade: bool = False, restart_identity: bool = True
    ) -> None:
        """
        Quickly empty the table, honouring the schema name with safe
        PostgreSQL identifier quoting. Commits immediately.

        Args:
            cascade: Append ``CASCADE`` to the statement.
            restart_identity: Append ``RESTART IDENTITY`` to the statement.
        """
        table = BriefDigitalCertificateOpu.__table__
        full_table_name = (
            f"{self._quote_ident(self.schema)}.{self._quote_ident(table.name)}"
        )

        opts: list[str] = []
        if restart_identity:
            opts.append("RESTART IDENTITY")
        if cascade:
            opts.append("CASCADE")
        suffix = f" {' '.join(opts)}" if opts else ""

        await self.s.execute(text(f"TRUNCATE TABLE {full_table_name}{suffix}"))
        await self.s.commit()

    async def bulk_insert(
        self, records: Sequence[dict[str, Any]], batch_size: int | None = None
    ) -> int:
        """
        Bulk-insert records into the table in batches.

        Does not commit; the caller owns the outer transaction.

        Args:
            records: Row dicts to insert.
            batch_size: Batch size (default: ``PG_BATCH_SIZE`` from config).

        Returns:
            Number of inserted records.
        """
        if not records:
            return 0
        if batch_size is None:
            batch_size = self.batch_size

        total_inserted = 0
        for start in range(0, len(records), batch_size):
            batch = records[start : start + batch_size]
            # Each batch runs inside a SAVEPOINT so a failing batch does
            # not invalidate the enclosing transaction.
            async with self.s.begin_nested():
                await self.s.execute(
                    pg_insert(BriefDigitalCertificateOpu).values(batch)
                )
                await self.s.flush()
            total_inserted += len(batch)

        return total_inserted

    async def bulk_upsert(
        self, records: Sequence[dict[str, Any]], batch_size: int | None = None
    ) -> int:
        """
        Bulk insert-or-update (UPSERT) records in batches.

        On conflict with the table's natural key, every non-key column is
        overwritten with the incoming value, except the load-audit columns
        in :attr:`_IMMUTABLE_COLUMNS`. Does not commit; the caller owns the
        outer transaction.

        Args:
            records: Row dicts to upsert.
            batch_size: Batch size (default: ``PG_BATCH_SIZE`` from config).

        Returns:
            Number of processed records.
        """
        if not records:
            return 0
        if batch_size is None:
            batch_size = self.batch_size

        update_columns = {
            c.name
            for c in BriefDigitalCertificateOpu.__table__.columns
            if not c.primary_key and c.name not in self._IMMUTABLE_COLUMNS
        }

        total_upserted = 0
        for start in range(0, len(records), batch_size):
            batch = records[start : start + batch_size]
            insert_stmt = pg_insert(BriefDigitalCertificateOpu).values(batch)
            update_cols = {
                col: insert_stmt.excluded[col] for col in update_columns
            }
            stmt = insert_stmt.on_conflict_do_update(
                index_elements=[
                    "object_id",
                    "desk_nm",
                    "actdate",
                    "layer_cd",
                    "opu_cd",
                    "opu_lvl",
                    "opu_prnt_cd",
                    "object_unit",
                ],
                set_=update_cols,
            )
            # SAVEPOINT per batch, mirroring bulk_insert.
            async with self.s.begin_nested():
                await self.s.execute(stmt)
                await self.s.flush()
            total_upserted += len(batch)

        return total_upserted