diff --git a/.gitignore b/.gitignore
index 7f0de2b..6359352 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+tests/out
 
 # Translations
 *.mo
diff --git a/pyproject.toml b/pyproject.toml
index 3e14cec..96d35cc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
   "pandas",
   "openpyxl",
   "pdfminer.six",
+  "pymupdf4llm",
   "puremagic",
   "pydub",
   "youtube-transcript-api",
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 789c1e5..8b1daf8 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -14,7 +14,7 @@
 import traceback
 import zipfile
 from xml.dom import minidom
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Literal, Mapping
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
@@ -24,6 +24,7 @@
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
+import pymupdf4llm
 import pptx
 
 # File-format detection
@@ -676,19 +677,60 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
 
 class PdfConverter(DocumentConverter):
     """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    Converts PDFs to Markdown. With the default "pdfminer" engine most style information is ignored, so the
+    results are essentially plain-text. The "pymupdf4llm" engine can be selected instead.
     """
 
+    # Engine name -> callable taking the PDF path plus engine-specific keyword arguments.
+    _engines: Mapping[str, Any] = {
+        "pdfminer": pdfminer.high_level.extract_text,
+        "pymupdf4llm": pymupdf4llm.to_markdown,
+    }
+
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
+        self,
+        local_path,
+        engine: Optional[Literal["pdfminer", "pymupdf4llm"]] = "pdfminer",
+        engine_kwargs: Optional[Mapping[str, Any]] = None,
+        **kwargs,
+    ) -> Union[None, DocumentConverterResult]:
+        """
+        Extract the text of the PDF at `local_path` with the selected engine.
+
+        Args:
+            local_path: Path of the file to convert.
+            engine: Key of `_engines` to use; None selects the default "pdfminer".
+            engine_kwargs: Extra keyword arguments forwarded unchanged to the engine function.
+            **kwargs: `file_extension` is read from here; only ".pdf" files are handled.
+
+        Returns:
+            None if the file is not a PDF, else a DocumentConverterResult holding the extracted text.
+
+        Raises:
+            FileConversionException: If `engine` is not one of the supported engine names.
+
+        Example:
+            >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
+            >>> markitdown.convert(source, engine="pymupdf4llm")
+        """
         # Bail if not a PDF
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".pdf":
             return None
 
-        return DocumentConverterResult(
-            title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
-        )
+        # None means "use the default engine" instead of failing with a KeyError on lookup.
+        if engine is None:
+            engine = "pdfminer"
+        if engine not in self._engines:
+            raise FileConversionException(
+                "'engine' not valid for {} files. Please choose between {}.".format(
+                    extension, list(self._engines.keys())
+                )
+            )
+
+        # A None default (not `{}`) keeps one mutable dict from being shared across calls.
+        text_content = self._engines[engine](local_path, **(engine_kwargs or {}))
+        return DocumentConverterResult(title=None, text_content=text_content)
 
 
 class DocxConverter(HtmlConverter):
diff --git a/tests/test_files/2308.08155v2.pdf b/tests/test_files/2308.08155v2.pdf
new file mode 100644
index 0000000..fffb9dd
Binary files /dev/null and b/tests/test_files/2308.08155v2.pdf differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 4a981bd..d49cd92 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -7,7 +7,6 @@
 import requests
 
 from warnings import catch_warnings, resetwarnings
-
 from markitdown import MarkItDown
 
 skip_remote = (
@@ -299,6 +298,46 @@ def test_markitdown_llm() -> None:
     for test_string in ["red", "circle", "blue", "square"]:
         assert test_string in result.text_content.lower()
 
+
+def test_markitdown_pdf() -> None:
+    markitdown = MarkItDown()
+
+    # The local copy of the PDF is used here; PDF_TEST_URL may also work.
+    pdf_path = os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf")
+
+    # pymupdf4llm engine, with extra engine kwargs limiting it to pages 0-9.
+    result = markitdown.convert(
+        pdf_path,
+        engine="pymupdf4llm",
+        engine_kwargs={"show_progress": False, "pages": range(10)},
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # pymupdf4llm engine with image extraction: `write_images` must be True and
+    # `image_path` sets the directory the images are saved to.
+    result = markitdown.convert(
+        pdf_path,
+        engine="pymupdf4llm",
+        engine_kwargs={
+            "show_progress": False,
+            "write_images": True,
+            "image_path": "tests/out",
+            "pages": range(10),
+        },
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # pdfminer engine, with extra engine kwargs limiting it to pages 0-9.
+    result = markitdown.convert(
+        pdf_path,
+        engine="pdfminer",
+        engine_kwargs={"page_numbers": range(10)},
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
@@ -307,3 +346,4 @@ def test_markitdown_llm() -> None:
     test_markitdown_exiftool()
     test_markitdown_deprecation()
     test_markitdown_llm()
+    test_markitdown_pdf()