From 42fcc0eb16716f8b1ed06d7184b4259fa7f192f0 Mon Sep 17 00:00:00 2001
From: Arxip222 <arhipbirukov@gmail.com>
Date: Wed, 24 Dec 2025 16:17:50 +0300
Subject: [PATCH] =?UTF-8?q?=D0=97=D0=B0=D0=B1=D1=8B=D0=BB=20=D0=B1=D0=B0?=
 =?UTF-8?q?=D1=82=D1=87=D0=B8=D1=82=D1=8C,=20=D1=82=D0=B5=D0=BF=D0=B5?=
 =?UTF-8?q?=D1=80=D1=8C=20=D0=BE=D0=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/src/application/services/rag_service.py   | 15 ++++++++++++---
 backend/src/application/services/text_splitter.py |  5 +++++
 .../repositories/qdrant/vector_repository.py      | 10 +++++++++-
 .../telegram/handlers/question_handler.py         |  2 --
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/backend/src/application/services/rag_service.py b/backend/src/application/services/rag_service.py
index 439b3f5..eb65638 100644
--- a/backend/src/application/services/rag_service.py
+++ b/backend/src/application/services/rag_service.py
@@ -29,7 +29,7 @@ class RAGService:
         self.splitter = splitter or TextSplitter()
 
     async def index_document(self, document: Document) -> list[DocumentChunk]:
-        chunks_text = self.splitter.split(document.content)
+        chunks_text = self.splitter.split(document.content)        
         chunks: list[DocumentChunk] = []
         for idx, text in enumerate(chunks_text):
             chunks.append(
@@ -42,9 +42,18 @@ class RAGService:
                 )
             )
 
-        embeddings = self.embedding_service.embed_texts([c.content for c in chunks])
+        EMBEDDING_BATCH_SIZE = 50
+        all_embeddings: list[list[float]] = []
+        
+        for i in range(0, len(chunks), EMBEDDING_BATCH_SIZE):
+            batch_chunks = chunks[i:i + EMBEDDING_BATCH_SIZE]
+            batch_texts = [c.content for c in batch_chunks]
+            batch_embeddings = self.embedding_service.embed_texts(batch_texts)
+            all_embeddings.extend(batch_embeddings)
+        
+        print(f"Created {len(all_embeddings)} embeddings, upserting to Qdrant...")
         await self.vector_repository.upsert_chunks(
-            chunks, embeddings, model_version=self.embedding_service.model_version()
+            chunks, all_embeddings, model_version=self.embedding_service.model_version()
         )
         return chunks
 
diff --git a/backend/src/application/services/text_splitter.py b/backend/src/application/services/text_splitter.py
index f4410f2..06519ea 100644
--- a/backend/src/application/services/text_splitter.py
+++ b/backend/src/application/services/text_splitter.py
@@ -39,5 +39,10 @@ class TextSplitter:
 
     def _split_sentences(self, text: str) -> Iterable[str]:
         parts = re.split(r"(?<=[\.\?\!])\s+", text)
+        # No sentence boundary matched and the text is more than twice
+        # chunk_size: fall back to consecutive fixed-width slices.
+        if len(parts) == 1 and len(text) > self.chunk_size * 2:
+            width = self.chunk_size
+            return [text[k:k + width] for k in range(0, len(text), width)]
         return [p.strip() for p in parts if p.strip()]
 
diff --git a/backend/src/infrastructure/repositories/qdrant/vector_repository.py b/backend/src/infrastructure/repositories/qdrant/vector_repository.py
index 373f533..a03ddf1 100644
--- a/backend/src/infrastructure/repositories/qdrant/vector_repository.py
+++ b/backend/src/infrastructure/repositories/qdrant/vector_repository.py
@@ -36,6 +36,8 @@ class QdrantVectorRepository(IVectorRepository):
         embeddings: Sequence[list[float]],
         model_version: str,
     ) -> None:
+        BATCH_SIZE = 100
+        
         points = []
         for chunk, vector in zip(chunks, embeddings):
             points.append(
@@ -52,7 +54,13 @@ class QdrantVectorRepository(IVectorRepository):
                     },
                 )
             )
-        self.client.upsert(collection_name=self.collection_name, points=points)
+            
+            if len(points) >= BATCH_SIZE:
+                self.client.upsert(collection_name=self.collection_name, points=points)
+                points = []
+        
+        if points:
+            self.client.upsert(collection_name=self.collection_name, points=points)
 
     async def search(
         self,
diff --git a/tg_bot/infrastructure/telegram/handlers/question_handler.py b/tg_bot/infrastructure/telegram/handlers/question_handler.py
index 1efc778..43950e1 100644
--- a/tg_bot/infrastructure/telegram/handlers/question_handler.py
+++ b/tg_bot/infrastructure/telegram/handlers/question_handler.py
@@ -84,7 +84,6 @@ async def process_premium_question(message: Message, user: User, question_text:
                 try:
                     from urllib.parse import unquote
                     decoded = unquote(title)
-                    # Если декодирование изменило строку или исходная содержит %XX
                     if decoded != title or '%' in title:
                         title = decoded
                 except:
@@ -152,7 +151,6 @@ async def process_free_question(message: Message, user: User, question_text: str
                 try:
                     from urllib.parse import unquote
                     decoded = unquote(title)
-                    # Если декодирование изменило строку или исходная содержит %XX
                     if decoded != title or '%' in title:
                         title = decoded
                 except: