Skip to content

Commit

Permalink
added json block formatting removal
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Sep 4, 2024
1 parent 2e52ff2 commit e69a00a
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 5 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def read_git_requirements(file):

setup(
name='thepipe_api',
version='1.2.5',
version='1.2.6',
author='Emmett McFarlane',
author_email='[email protected]',
description='AI-native extractor, powered by multimodal LLMs.',
Expand Down
2 changes: 1 addition & 1 deletion tests/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_extract_json_from_response(self):

def test_extract(self):
results, total_tokens_used = extract(
chunks=self.chunks,
chunks=self.chunks, # receipt
schema=self.schema,
)

Expand Down
13 changes: 10 additions & 3 deletions thepipe/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,21 @@

def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]:
def clean_response_text(llm_response: str) -> str:
return llm_response.encode('utf-8', 'ignore').decode('utf-8')
return llm_response.encode('utf-8', 'ignore').decode('utf-8').strip()

llm_response = llm_response.strip()
# try to match the contents of a fenced code block (``` or ```json)
code_block_pattern = r'^```(?:json)?\s*([\s\S]*?)\s*```$'
match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL)
if match:
llm_response = match.group(1).strip()
llm_response = match.group(1)
llm_response = clean_response_text(llm_response)

# try to remove code block formatting if still present
if llm_response.startswith("```json") and llm_response.endswith("```"):
llm_response = llm_response[len("```json"):-len("```")]
llm_response = clean_response_text(llm_response)

# parse json by matching curly braces
try:
parsed_json = json.loads(llm_response)
return parsed_json
Expand Down

0 comments on commit e69a00a

Please sign in to comment.