feat: using nltk assets in docker image
christinestraub committed Dec 26, 2024
1 parent 3c7985f commit 8ee6a2b
Showing 5 changed files with 36 additions and 63 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev6
+## 0.16.12-dev7

 ### Enhancements

@@ -15,6 +15,7 @@
 - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
 - **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
 - **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
+- **Fix NLTK download** to use the NLTK assets baked into the Docker image.

 ## 0.16.11
19 changes: 7 additions & 12 deletions Dockerfile
@@ -10,27 +10,22 @@ COPY test_unstructured test_unstructured
 COPY example-docs example-docs

 RUN chown -R notebook-user:notebook-user /app && \
-    apk add --no-cache font-ubuntu git && \
-    fc-cache -fv && \
-    if [ "$(readlink -f /usr/bin/python3)" != "/usr/bin/python3.11" ]; then \
-        ln -sf /usr/bin/python3.11 /usr/bin/python3; \
-    fi
+    apk add font-ubuntu git && \
+    fc-cache -fv && \
+    ln -s /usr/bin/python3.11 /usr/bin/python3

 USER notebook-user

-RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' \;
+RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'

+RUN python3.11 -c "import os; os.makedirs('/home/notebook-user/nltk_data', exist_ok=True)" && \
+    python3.11 -c "from nltk.downloader import download; download('punkt_tab'); download('averaged_perceptron_tagger_eng')"

-RUN python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
-
-RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()"
-
-RUN python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; \
-    model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
+RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
+    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

 ENV PATH="${PATH}:/home/notebook-user/.local/bin"
 ENV TESSDATA_PREFIX=/usr/local/share/tessdata
+ENV NLTK_DATA=/home/notebook-user/nltk_data

 CMD ["/bin/bash"]
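
Since the NLTK assets are now baked in at build time instead of downloaded on first use, a quick smoke test inside the built image can confirm that they resolve under NLTK_DATA. A minimal sketch, not part of the commit (the asset names come from the RUN layer above; the script name is hypothetical):

# smoke_test_nltk_assets.py -- hypothetical check, not part of this commit.
# Confirms the pre-baked NLTK assets resolve under NLTK_DATA inside the image.
import os

import nltk

# Mirror what unstructured/nlp/tokenize.py does with the NLTK_DATA env var.
nltk.data.path.append(os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data"))

for resource in ("tokenizers/punkt_tab", "taggers/averaged_perceptron_tagger_eng"):
    # nltk.data.find raises LookupError when a resource is missing, so the
    # script fails loudly if the image was built without the download layer.
    print(nltk.data.find(resource))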
21 changes: 4 additions & 17 deletions test_unstructured/nlp/test_tokenize.py
@@ -1,27 +1,14 @@
 from typing import List, Tuple
 from unittest.mock import patch

-import nltk
-
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize


-def test_nltk_packages_download_if_not_present():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find", side_effect=LookupError):
-        with patch.object(tokenize, "download_nltk_packages") as mock_download:
-            tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_called_once()
-
-
-def test_nltk_packages_do_not_download_if():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
-        tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_not_called()
+def test_nltk_assets_validation():
+    with patch("unstructured.nlp.tokenize.validate_nltk_assets") as mock_validate:
+        tokenize.validate_nltk_assets()
+        mock_validate.assert_called_once()


 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
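
The replacement test above verifies only the mock wiring, since validate_nltk_assets is patched before it is called. A companion test that exercises the real failure path could stub check_for_nltk_package instead; the sketch below is a suggestion, not part of the commit, and assumes pytest is in use and that the module imported cleanly (i.e. the assets are present):

import pytest
from unittest.mock import patch

from unstructured.nlp import tokenize


def test_validate_nltk_assets_raises_when_asset_missing():
    # Hypothetical companion test: force the filesystem check to report a
    # missing asset and expect the RuntimeError raised by validate_nltk_assets.
    with patch("unstructured.nlp.tokenize.check_for_nltk_package", return_value=False):
        with pytest.raises(RuntimeError, match="Required NLTK package"):
            tokenize.validate_nltk_assets()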
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.12-dev6" # pragma: no cover
+__version__ = "0.16.12-dev7" # pragma: no cover
54 changes: 22 additions & 32 deletions unstructured/nlp/tokenize.py
@@ -11,66 +11,56 @@

 CACHE_MAX_SIZE: Final[int] = 128


-def download_nltk_packages():
-    nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-    nltk.download("punkt_tab", quiet=True)
+# Define the NLTK data path based on the Docker image environment
+NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data")
+nltk.data.path.append(NLTK_DATA_PATH)


 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
-    """Checks to see if the specified NLTK package exists on the file system"""
-    paths: list[str] = []
-    for path in nltk.data.path:
-        if not path.endswith("nltk_data"):
-            path = os.path.join(path, "nltk_data")
-        paths.append(path)
-
+    """Checks to see if the specified NLTK package exists on the file system."""
     try:
-        nltk.find(f"{package_category}/{package_name}", paths=paths)
+        nltk.find(f"{package_category}/{package_name}")
         return True
     except (LookupError, OSError):
         return False


-# We cache this because we do not want to attempt
-# downloading the packages multiple times
-@lru_cache()
-def _download_nltk_packages_if_not_present():
-    """If required NLTK packages are not available, download them."""
-
-    tagger_available = check_for_nltk_package(
-        package_category="taggers",
-        package_name="averaged_perceptron_tagger_eng",
-    )
-    tokenizer_available = check_for_nltk_package(
-        package_category="tokenizers", package_name="punkt_tab"
-    )
-
-    if (not tokenizer_available) or (not tagger_available):
-        download_nltk_packages()
+# Ensure NLTK data exists in the specified path (pre-baked in Docker)
+def validate_nltk_assets():
+    """Validate that required NLTK packages are preloaded in the image."""
+    required_assets = [
+        ("punkt_tab", "tokenizers"),
+        ("averaged_perceptron_tagger_eng", "taggers"),
+    ]
+    for package_name, category in required_assets:
+        if not check_for_nltk_package(package_name, category):
+            raise RuntimeError(
+                f"Required NLTK package '{package_name}' is missing. "
+                f"Ensure it is baked into the Docker image at '{NLTK_DATA_PATH}'."
+            )


+# Validate NLTK assets at import time
+validate_nltk_assets()


 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
     return _sent_tokenize(text)


 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def word_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK word tokenizer with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
     return _word_tokenize(text)


 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
-    # NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
-    # situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
-    # for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
+    # NOTE: Splitting into sentences before tokenizing helps with situations
+    # like "ITEM 1A. PROPERTIES" where tokens can be misinterpreted.
     sentences = _sent_tokenize(text)
     parts_of_speech: list[tuple[str, str]] = []
     for sentence in sentences:
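
Net effect of the module changes: asset validation now happens once at import time, and the tokenizer wrappers no longer run a download check on every call. A short usage sketch (assuming the assets are present under NLTK_DATA; the sample text echoes the NOTE comment above):

# Usage sketch, assuming punkt_tab and averaged_perceptron_tagger_eng are
# already present under NLTK_DATA; otherwise the import raises RuntimeError.
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize

text = "ITEM 1A. PROPERTIES. The company owns several buildings."
print(sent_tokenize(text))  # sentence split; repeat calls hit the LRU cache
print(word_tokenize(text))  # word tokens for the same string
print(pos_tag(text))        # POS tags, computed sentence-by-sentence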
