# services/result_processor.py
import json
from dataclasses import dataclass
from typing import Optional, Dict

from adapters.base import RawDocument
from core.database import db_session
from core.knowledge_base import kb_client
from models import LexDocument, LexAlert
from services.ai_classifier import AIClassifier
from services.ai_filter import AIFilter
from services.ai_summarizer import AISummarizer
from services.alert_engine import AlertEngine
from services.document_formatter import DocumentFormatter
@dataclass
class ProcessingResult:
    """Outcome of processing one parsed document through the pipeline."""
    # True when the document passed every stage and was stored.
    success: bool
    # Database id of the stored document (set only when success is True).
    document_id: Optional[int] = None
    # Why the document was skipped: "duplicate" or "low_relevance (<score>)".
    reject_reason: Optional[str] = None
    # Validation error message when the input document was malformed.
    error: Optional[str] = None
class ResultProcessor:
    """Pipeline that turns a parsed ``RawDocument`` into a stored record.

    Stages, in order: validation -> duplicate check -> AI relevance filter ->
    AI classification -> AI summarization -> Markdown formatting ->
    Knowledge Base upload -> metadata persistence -> alert creation.
    """

    def __init__(self):
        self.ai_filter = AIFilter()
        self.ai_classifier = AIClassifier()
        self.ai_summarizer = AISummarizer()
        self.formatter = DocumentFormatter()
        self.alert_engine = AlertEngine()

    async def process(self, raw_document: RawDocument) -> ProcessingResult:
        """Run the full pipeline for one parsed document.

        Returns:
            ProcessingResult with ``success=True`` and the new ``document_id``,
            or ``success=False`` carrying either ``error`` (failed validation)
            or ``reject_reason`` (duplicate / low relevance).
        """
        # 1. Validation: cheap structural checks before any AI calls.
        validation_error = self._validate(raw_document)
        if validation_error:
            return ProcessingResult(success=False, error=validation_error)

        # 2. Duplicate check against already-stored documents.
        if await self._is_duplicate(raw_document):
            return ProcessingResult(success=False, reject_reason="duplicate")

        # 3. AI relevance filter; rejections are only counted, not stored.
        relevance = await self.ai_filter.evaluate(raw_document.text)
        if not relevance.is_relevant:
            await self._log_rejected(raw_document, relevance.score)
            return ProcessingResult(
                success=False,
                reject_reason=f"low_relevance ({relevance.score:.2f})"
            )

        # 4. AI classification (doc type / category / relevance level).
        classification = await self.ai_classifier.classify(
            raw_document.text,
            raw_document.title
        )

        # 5. AI summarization, guided by the detected category.
        summary = await self.ai_summarizer.summarize(
            raw_document.text,
            raw_document.title,
            classification.category
        )

        # 6. Render the document as Markdown for the Knowledge Base.
        markdown = self.formatter.format(
            raw_document=raw_document,
            classification=classification,
            summary=summary,
            relevance_score=relevance.score
        )

        # 7. Upload to the Knowledge Base; kb_id links the DB row to it.
        kb_id = await kb_client.upload(
            content=markdown,
            metadata={
                "source": raw_document.source,
                "category": classification.category,
                "access_level": "manager",
                "brand_id": "shared"
            }
        )

        # 8. Persist document metadata and obtain the database id.
        document_id = await self._save_document(
            raw_document=raw_document,
            classification=classification,
            summary=summary,
            relevance_score=relevance.score,
            kb_id=kb_id
        )

        # 9. Notify subscribers about the newly stored document.
        await self.alert_engine.create_alert(
            document_id=document_id,
            title=raw_document.title,
            category=classification.category,
            relevance=classification.relevance_level,
            summary=summary.short_summary
        )

        return ProcessingResult(success=True, document_id=document_id)

    def _validate(self, doc: RawDocument) -> Optional[str]:
        """Return a human-readable error string, or None when ``doc`` is valid."""
        if not doc.text or len(doc.text) < 100:
            return "Text too short or empty"
        if not doc.title:
            return "Title is missing"
        if len(doc.text) > 1_000_000:
            return "Text too long (>1MB)"
        return None

    async def _is_duplicate(self, doc: RawDocument) -> bool:
        """Check whether a document with the same URL or number from the
        same source is already stored."""
        async with db_session() as session:
            result = await session.execute(
                """
                SELECT COUNT(*) FROM lex_documents
                WHERE (url = :url OR document_number = :number)
                AND source = :source
                """,
                {
                    "url": doc.url,
                    "number": doc.number,
                    "source": doc.source
                }
            )
            return result.scalar() > 0

    async def _log_rejected(self, doc: RawDocument, score: float) -> None:
        """Increment today's 'rejected' counter for the document's source."""
        async with db_session() as session:
            await session.execute(
                """
                INSERT INTO lex_statistics
                (date, source, action, count, metadata)
                VALUES (CURRENT_DATE, :source, 'rejected', 1, :metadata)
                ON CONFLICT (date, source, action)
                DO UPDATE SET count = lex_statistics.count + 1
                """,
                {
                    "source": doc.source,
                    # Serialize explicitly: drivers generally cannot bind a
                    # raw Python dict to a JSON column parameter in text SQL.
                    # NOTE(review): if the session layer already installs a
                    # JSON codec for this column, confirm no double-encoding.
                    "metadata": json.dumps({"url": doc.url, "score": score})
                }
            )
            await session.commit()

    async def _save_document(
        self,
        *,
        raw_document: RawDocument,
        classification,
        summary,
        relevance_score: float,
        kb_id
    ) -> int:
        """Insert the document row and return its database id.

        Parameters are keyword-only (the previous ``**kwargs`` form accepted
        the same names); ``classification`` and ``summary`` are the AI service
        result objects, ``kb_id`` is the Knowledge Base upload identifier.
        """
        async with db_session() as session:
            result = await session.execute(
                """
                INSERT INTO lex_documents (
                source, url, title, document_number, document_date,
                effective_date, doc_type, category, relevance_level,
                relevance_score, summary, kb_id, created_at
                ) VALUES (
                :source, :url, :title, :number, :date,
                :effective_date, :doc_type, :category, :relevance_level,
                :relevance_score, :summary, :kb_id, :created_at
                ) RETURNING id
                """,
                {
                    "source": raw_document.source,
                    "url": raw_document.url,
                    "title": raw_document.title,
                    "number": raw_document.number,
                    "date": raw_document.date,
                    "effective_date": raw_document.effective_date,
                    "doc_type": classification.doc_type,
                    "category": classification.category,
                    "relevance_level": classification.relevance_level,
                    "relevance_score": relevance_score,
                    "summary": summary.full_summary,
                    "kb_id": kb_id,
                    "created_at": raw_document.parsed_at
                }
            )
            await session.commit()
            return result.scalar()