From 6a994c645030a87ed2234bf8ba58a9240a9cba4d Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Fri, 22 Nov 2024 17:33:37 +0000 Subject: [PATCH] feat: added en-core-web-sm as an explicit dependency --- .../pii_anonymiser/spacy_anonymizer.py | 13 ++++++++++-- poetry.lock | 21 +++++++++++++++++-- pyproject.toml | 11 ++++++++-- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/aidial_interceptors_sdk/examples/chat_completion/pii_anonymiser/spacy_anonymizer.py b/aidial_interceptors_sdk/examples/chat_completion/pii_anonymiser/spacy_anonymizer.py index 79e5f00..f329e6c 100644 --- a/aidial_interceptors_sdk/examples/chat_completion/pii_anonymiser/spacy_anonymizer.py +++ b/aidial_interceptors_sdk/examples/chat_completion/pii_anonymiser/spacy_anonymizer.py @@ -1,3 +1,4 @@ +import logging import re from collections import defaultdict from functools import cache @@ -22,11 +23,19 @@ "PRODUCT", ] +_log = logging.getLogger(__name__) + @cache def _get_pipeline(model: str) -> Language: - download_model(model) - return load_model(model) + try: + return load_model(model) + except Exception as e: + _log.warning( + f"Failed to load spaCy model {model!r}: {str(e)}\nDownloading the model..." + ) + download_model(model) + return load_model(model) # Preemptively load the default model on the server start-up diff --git a/poetry.lock b/poetry.lock index 8c73004..cca6bb3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -647,6 +647,23 @@ files = [ {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, ] +[[package]] +name = "en-core-web-sm" +version = "3.7.1" +description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
+optional = true +python-versions = "*" +files = [ + {file = "en_core_web_sm-3.7.1-py3-none-any.whl", hash = "sha256:86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889"}, +] + +[package.dependencies] +spacy = ">=3.7.2,<3.8.0" + +[package.source] +type = "url" +url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl" + [[package]] name = "fastapi" version = "0.115.2" @@ -3008,9 +3025,9 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -examples = ["aiostream", "numpy", "pillow", "spacy"] +examples = ["aiostream", "en_core_web_sm", "numpy", "pillow", "spacy"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<4.0" -content-hash = "da8da7edaaebcc327d3f9155e2206ab9ea070d119ebd5efe1e8fc1fff58bf910" +content-hash = "479014a1cbff10400410cba9f2e0b408d3bf00d5d2e7670223e510f6ac696743" diff --git a/pyproject.toml b/pyproject.toml index e1a89ea..01b6ea1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,10 +31,17 @@ aidial-sdk = { version = "^0.15.0", extras = ["telemetry"] } aiostream = { version = "^0.6.2", optional = true } pillow = { version = "^10.4.0", optional = true } numpy = { version = "^1.26.1", optional = true } -spacy = { version = "^3.7.5", optional = true } +spacy = { version = "3.7.5", optional = true } + +[tool.poetry.dependencies.en_core_web_sm] +# Pinning a particular combination of spaCy and en_core_web_sm versions: +# https://github.com/explosion/spaCy/issues/13690#issuecomment-2487873386 +# otherwise, there is a chance of running into a 403 error at runtime.
+url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl" +optional = true [tool.poetry.extras] -examples = ["aiostream", "pillow", "numpy", "spacy"] +examples = ["aiostream", "pillow", "numpy", "spacy", "en_core_web_sm"] [tool.poetry.group.test.dependencies] pytest = "7.4.0"