From e71a01a104dd663c730e494eb0b6467bb51df357 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 26 Sep 2024 19:55:32 +0200
Subject: [PATCH] manually fix PLBart tokenizer

---
 src/transformers/models/plbart/tokenization_plbart.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py
index 9ab2e33f7f0dba..f9648924c8e0fa 100644
--- a/src/transformers/models/plbart/tokenization_plbart.py
+++ b/src/transformers/models/plbart/tokenization_plbart.py
@@ -130,6 +130,7 @@ def __init__(
         tgt_lang=None,
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         additional_special_tokens=None,
+        clean_up_tokenization_spaces=True,
         **kwargs,
     ):
         # Mask token behave like a normal word, i.e. include the space before it
@@ -200,6 +201,7 @@ def __init__(
             tgt_lang=tgt_lang,
             additional_special_tokens=_additional_special_tokens,
             sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
 