From 42fcc0eb16716f8b1ed06d7184b4259fa7f192f0 Mon Sep 17 00:00:00 2001 From: Arxip222 Date: Wed, 24 Dec 2025 16:17:50 +0300 Subject: [PATCH] =?UTF-8?q?=D0=97=D0=B0=D0=B1=D1=8B=D0=BB=20=D0=B1=D0=B0?= =?UTF-8?q?=D1=82=D1=87=D0=B8=D1=82=D1=8C,=20=D1=82=D0=B5=D0=BF=D0=B5?= =?UTF-8?q?=D1=80=D1=8C=20=D0=BE=D0=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/src/application/services/rag_service.py | 15 ++++++++++++--- backend/src/application/services/text_splitter.py | 5 +++++ .../repositories/qdrant/vector_repository.py | 10 +++++++++- .../telegram/handlers/question_handler.py | 2 -- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/backend/src/application/services/rag_service.py b/backend/src/application/services/rag_service.py index 439b3f5..eb65638 100644 --- a/backend/src/application/services/rag_service.py +++ b/backend/src/application/services/rag_service.py @@ -29,7 +29,7 @@ class RAGService: self.splitter = splitter or TextSplitter() async def index_document(self, document: Document) -> list[DocumentChunk]: - chunks_text = self.splitter.split(document.content) + chunks_text = self.splitter.split(document.content) chunks: list[DocumentChunk] = [] for idx, text in enumerate(chunks_text): chunks.append( @@ -42,9 +42,18 @@ class RAGService: ) ) - embeddings = self.embedding_service.embed_texts([c.content for c in chunks]) + EMBEDDING_BATCH_SIZE = 50 + all_embeddings: list[list[float]] = [] + + for i in range(0, len(chunks), EMBEDDING_BATCH_SIZE): + batch_chunks = chunks[i:i + EMBEDDING_BATCH_SIZE] + batch_texts = [c.content for c in batch_chunks] + batch_embeddings = self.embedding_service.embed_texts(batch_texts) + all_embeddings.extend(batch_embeddings) + + print(f"Created {len(all_embeddings)} embeddings, upserting to Qdrant...") await self.vector_repository.upsert_chunks( - chunks, embeddings, model_version=self.embedding_service.model_version() + chunks, all_embeddings, model_version=self.embedding_service.model_version() ) return chunks diff --git a/backend/src/application/services/text_splitter.py b/backend/src/application/services/text_splitter.py index f4410f2..06519ea 100644 --- a/backend/src/application/services/text_splitter.py +++ b/backend/src/application/services/text_splitter.py @@ -39,5 +39,10 @@ class TextSplitter: def _split_sentences(self, text: str) -> Iterable[str]: parts = re.split(r"(?<=[\.\?\!])\s+", text) + if len(parts) == 1 and len(text) > self.chunk_size * 2: + chunk_text = [] + for i in range(0, len(text), self.chunk_size): + chunk_text.append(text[i:i + self.chunk_size]) + return chunk_text return [p.strip() for p in parts if p.strip()] diff --git a/backend/src/infrastructure/repositories/qdrant/vector_repository.py b/backend/src/infrastructure/repositories/qdrant/vector_repository.py index 373f533..a03ddf1 100644 --- a/backend/src/infrastructure/repositories/qdrant/vector_repository.py +++ b/backend/src/infrastructure/repositories/qdrant/vector_repository.py @@ -36,6 +36,8 @@ class QdrantVectorRepository(IVectorRepository): embeddings: Sequence[list[float]], model_version: str, ) -> None: + BATCH_SIZE = 100 + points = [] for chunk, vector in zip(chunks, embeddings): points.append( @@ -52,7 +54,13 @@ class QdrantVectorRepository(IVectorRepository): }, ) ) - self.client.upsert(collection_name=self.collection_name, points=points) + + if len(points) >= BATCH_SIZE: + self.client.upsert(collection_name=self.collection_name, points=points) + points = [] + + if points: + self.client.upsert(collection_name=self.collection_name, points=points) async def search( self, diff --git a/tg_bot/infrastructure/telegram/handlers/question_handler.py b/tg_bot/infrastructure/telegram/handlers/question_handler.py index 1efc778..43950e1 100644 --- a/tg_bot/infrastructure/telegram/handlers/question_handler.py +++ b/tg_bot/infrastructure/telegram/handlers/question_handler.py @@ -84,7 +84,6 @@ async def process_premium_question(message: Message, user: User, question_text: try: from urllib.parse import unquote decoded = unquote(title) - # Если декодирование изменило строку или исходная содержит %XX if decoded != title or '%' in title: title = decoded except: @@ -152,7 +151,6 @@ async def process_free_question(message: Message, user: User, question_text: str try: from urllib.parse import unquote decoded = unquote(title) - # Если декодирование изменило строку или исходная содержит %XX if decoded != title or '%' in title: title = decoded except: