Dependency Matcher freezes when large files are loaded in memory #12360
-
How to reproduce the behaviour
I have a spaCy pipeline with many components. Many of those components use data loaded from dicts (this is important).
import json
from copy import deepcopy
from datetime import datetime as dt
import spacy
from spacy.language import Language
from spacy.matcher import DependencyMatcher
# Parse the large JSON document once.
with open("large-file.json") as f:
    test_sample = json.loads(f.read())

# Keep 30 independent deep copies of the parsed data alive for the whole run.
# NOTE(review): unlike one huge str, this creates many container objects that
# the cyclic garbage collector tracks -- possibly relevant to the periodic
# pauses described in the report; unconfirmed.
test = [deepcopy(test_sample) for _ in range(30)]  # if you comment this out, there is no freeze
# test = "a" * 2516582400 * 2  # if you load this in memory instead of the JSON, it does not freeze either
class Matching:
    """Pipeline component that runs a ``DependencyMatcher`` over each doc.

    The component only measures and prints how long the matcher call takes;
    the doc is returned unchanged and the matches are discarded.

    Args:
        nlp: The pipeline object; its ``vocab`` is handed to the matcher.
    """

    # Dependency patterns keyed by match id. Each pattern is a list of node
    # dicts: the first is the anchor ("head"), each following node is linked
    # to an earlier one through LEFT_ID / REL_OP.
    PATTERNS = {
        0: [
            [
                {
                    "RIGHT_ID": "head",
                    "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "PROPN", "ADJ"]}},
                },
                {
                    "LEFT_ID": "head",
                    "REL_OP": ".",
                    "RIGHT_ID": "adp",
                    "RIGHT_ATTRS": {"POS": "ADP"},
                },
                {
                    "LEFT_ID": "adp",
                    "REL_OP": ".",
                    "RIGHT_ID": "nmod",
                    # Token constraints belong inside RIGHT_ATTRS. DEP used to
                    # sit next to RIGHT_ATTRS, which is not one of the
                    # documented node keys (LEFT_ID, REL_OP, RIGHT_ID,
                    # RIGHT_ATTRS), so it did not constrain the token.
                    "RIGHT_ATTRS": {
                        "POS": {"IN": ["NOUN", "PROPN", "ADJ"]},
                        "DEP": {"IN": ["nmod", "amod"]},
                    },
                },
            ]
        ]
    }

    def __init__(self, nlp):
        """Build the matcher on ``nlp.vocab`` and register every pattern."""
        self.nlp = nlp
        self.matcher = DependencyMatcher(nlp.vocab, validate=True)
        for name, pattern in self.PATTERNS.items():
            self.matcher.add(name, pattern)

    def __call__(self, doc):
        """Run the matcher on ``doc``, print the elapsed seconds, return ``doc``."""
        t = dt.now()
        # Only the duration is of interest here; the matches are not used.
        self.matcher(doc)
        t2 = dt.now()
        print(f"DONE MATCH -> {(t2-t).total_seconds()}")
        return doc
@Language.factory("matching")
def create_matching(nlp, name: str):
    """Factory registered under ``"matching"``; returns a ``Matching`` built from ``nlp``.

    ``name`` is part of the factory signature but is not used here.
    """
    component = Matching(nlp)
    return component
# Load the French "fr_core_news_lg" pipeline.
nlp = spacy.load("fr_core_news_lg")
# Insert the timing/matching component just before the "ner" component.
nlp.add_pipe("matching", before="ner")
# Sample French sentence; the loop below repeats it 1000 times per document.
text = "Retrouvez dans cet article différents témoignages de blogueurs voyage qui nous parlent des plus belles randonnées qu’ils ont pu faire en France. "
for i, j in enumerate(nlp.pipe((text * 1000 for i in range(100)), batch_size=1)):
    del j
With this code, the dep matcher usually takes 0.23s, but once in a while (for me at docs 25, 62 and 98) it takes ~1.17s. Thank you for reading!
Environment
|
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 3 replies
-
From the description it sounds like your computer is swapping because RAM is full. Keep an eye on the memory usage (I think you can look at "memory" in the Activity Monitor) and see if the delays start when memory usage gets close to 100%? |
Beta Was this translation helpful? Give feedback.
From the description it sounds like your computer is swapping because RAM is full. Keep an eye on the memory usage (I think you can look at "memory" in the Activity Monitor) and see if the delays start when memory usage gets close to 100%?