added multi-keyword chunker

emcf · Sep 9, 2024 · 586675b · 586675b
1 parent ccfe50a
commit 586675b
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 6 deletions.
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@ def read_git_requirements(file):
 
 setup(
     name='thepipe_api',
-    version='1.3.4',
+    version='1.3.7',
     author='Emmett McFarlane',
     author_email='[email protected]',
     description='AI-native extractor, powered by multimodal LLMs.',

diff --git a/thepipe/chunker.py b/thepipe/chunker.py
@@ -99,4 +99,27 @@ def chunk_semantic(chunks: List[Chunk], model_name: str = 'sentence-transformers
                     seen_images.append(image)
         new_chunks.append(Chunk(path=group_path, texts=group_texts, images=group_images))
 
+    return new_chunks
+
+# Starts a new chunk at every line that contains one of the keywords.
+def chunk_by_keywords(chunks: List[Chunk], keywords: List[str] = ['section']) -> List[Chunk]:
+    """Re-split chunks so a new chunk begins at each line containing a keyword.
+
+    The texts of every input chunk are joined with newlines and scanned line
+    by line. Whenever a line contains any keyword (case-insensitive substring
+    match) and some text has already been accumulated, the accumulated text
+    is emitted as its own chunk and a new one is started at that line.
+
+    Args:
+        chunks: Input chunks, processed in order. Never mutated.
+        keywords: Substrings that mark the start of a new chunk. The default
+            list is only read, never modified.
+
+    Returns:
+        The re-split chunks; an empty list when ``chunks`` is empty. Each
+        output chunk holds a single text (every line terminated by a newline),
+        the path of the input chunk in which its text began, and the images
+        of the input chunks it draws from.
+    """
+    if not chunks:
+        # Nothing to split; also avoids indexing chunks[0] on an empty list.
+        return []
+    # Lower-case the keywords once instead of once per scanned line.
+    lowered_keywords = [keyword.lower() for keyword in keywords]
+    new_chunks = []
+    current_lines = []
+    current_images = []
+    current_path = chunks[0].path
+    for chunk in chunks:
+        # True once a new output chunk was started inside this input chunk,
+        # i.e. this chunk's images were already copied into the accumulator.
+        images_attached = False
+        for line in '\n'.join(chunk.texts).split('\n'):
+            lowered_line = line.lower()
+            if current_lines and any(keyword in lowered_line for keyword in lowered_keywords):
+                # Flush under the path where the accumulated text began.
+                new_chunks.append(Chunk(path=current_path, texts=[''.join(current_lines)], images=current_images))
+                current_lines = []
+                # Copy instead of aliasing chunk.images, so extending the
+                # accumulator can never mutate or duplicate the input's images.
+                current_images = list(chunk.images)
+                current_path = chunk.path
+                images_attached = True
+            current_lines.append(line + '\n')
+        if not images_attached:
+            # The running chunk continues through this input chunk without a
+            # split, so it picks up this chunk's images exactly once.
+            current_images.extend(chunk.images)
+    if current_lines:
+        new_chunks.append(Chunk(path=current_path, texts=[''.join(current_lines)], images=current_images))
     return new_chunks
diff --git a/thepipe/extract.py b/thepipe/extract.py
@@ -2,9 +2,9 @@
 import json
 import re
 from typing import List, Dict, Union, Optional, Tuple, Callable
-from thepipe.core import HOST_URL, THEPIPE_API_KEY, Chunk, calculate_tokens
-from thepipe.scraper import scrape_url, scrape_file
-from thepipe.chunker import chunk_by_page
+from .core import HOST_URL, THEPIPE_API_KEY, Chunk, calculate_tokens
+from .scraper import scrape_url, scrape_file
+from .chunker import chunk_by_page, chunk_by_document, chunk_by_section, chunk_semantic, chunk_by_keywords
 import requests
 import os
 from openai import OpenAI
@@ -98,7 +98,7 @@ def extract_from_chunk(chunk: Chunk, chunk_index: int, schema: str, ai_model: st
                     if isinstance(llm_response_dict, dict):
                         response_dict.update(llm_response_dict)
                     elif isinstance(llm_response_dict, list):
-                        response_dict["error"] = f"Expected a single JSON object but received a list: {llm_response_dict}"
+                        response_dict["error"] = f"Expected a single JSON object but received a list: {llm_response_dict}. Try enabling multiple extractions."
                     else:
                         response_dict["error"] = f"Invalid JSON structure in LLM response: {llm_response_dict}"
             else:

diff --git a/thepipe/scraper.py b/thepipe/scraper.py
@@ -14,7 +14,7 @@
 import requests
 import json
 from .core import HOST_URL, THEPIPE_API_KEY, HOST_IMAGES, Chunk, make_image_url
-from .chunker import chunk_by_page
+from .chunker import chunk_by_page, chunk_by_document, chunk_by_section, chunk_semantic, chunk_by_keywords
 import tempfile
 import mimetypes
 import dotenv