Merge pull request #254 from huggingface/python_2
Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
thomwolf authored Feb 11, 2019
2 parents 2dfaf2f + 1e71f11 commit 03cdb2a
Showing 30 changed files with 5,728 additions and 298 deletions.
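For orientation, a minimal usage sketch of the models this PR adds. It is illustrative only, not code from the diff: the class names, the 'openai-gpt' shortcut, and the tokenizer methods are assumed to follow the same from_pretrained / tokenize / convert_tokens_to_ids interface the repository already exposes for BERT.

import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

# Download (and cache) the pretrained weights and vocabulary.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

tokens = tokenizer.tokenize("the dog is hairy .")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    lm_logits = model(input_ids)  # language-modeling logits: (batch, seq_len, vocab)

The Transformer-XL classes (TransfoXLTokenizer and TransfoXLLMHeadModel with the 'transfo-xl-wt103' shortcut) are expected to follow the same pattern.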
24 changes: 21 additions & 3 deletions .circleci/config.yml
@@ -1,11 +1,29 @@
version: 2
jobs:
build:
build_py3:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:3.7
- image: circleci/python:3.5
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest
- run: sudo pip install pytest ftfy spacy
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/
build_py2:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:2.7
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest spacy
- run: sudo pip install ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/
workflows:
version: 2
build_and_test:
jobs:
- build_py3
- build_py2
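The extra ftfy and spacy installs (and the `python -m spacy download en` step) supply the optional dependencies of the OpenAI GPT tokenizer added in this PR; ftfy is pinned to 4.4.3 in the Python 2 job, presumably because newer ftfy releases dropped Python 2 support. A small, illustrative availability probe (the fallback behaviour mentioned in the comment is an assumption based on the PR's README, not code from the diff):

def gpt_tokenizer_deps_available():
    """Return True if the optional spacy + ftfy tokenization path can be used."""
    try:
        import ftfy                # repairs mojibake in raw text
        import spacy               # provides the 'en' model downloaded in CI
        spacy.load('en')           # raises OSError if the model is missing
        return True
    except (ImportError, OSError):
        # Assumption: without these packages, the GPT tokenizer falls back
        # to BERT-style basic tokenization.
        return False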
481 changes: 449 additions & 32 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions examples/extract_features.py
@@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
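A minimal sketch of the same convention in code (illustrative only; it mirrors what convert_examples_to_features builds for a sentence pair):

from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens_a = tokenizer.tokenize("is this jacksonville ?")   # -> is this jack ##son ##ville ?
tokens_b = tokenizer.tokenize("no it is not .")

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
type_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
assert len(input_ids) == len(type_ids)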
62 changes: 42 additions & 20 deletions examples/run_classifier.py
@@ -15,26 +15,26 @@
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function

import argparse
import csv
import os
import logging
import argparse
import os
import random
from tqdm import tqdm, trange
import sys

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
@@ -91,10 +91,12 @@ def get_labels(self):
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding='utf-8') as f:
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines

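A side note on the open() change above (illustrative, not from the diff): the encoding= keyword disappears presumably because Python 2's builtin open() does not accept it; a script that needs an explicit encoding on both interpreters can use io.open instead. Here, plain open() plus the per-cell unicode(cell, 'utf-8') conversion is used because Python 2's csv module operates on byte strings.

from io import open  # io.open accepts encoding= on both Python 2 and 3

with open("train.tsv", "r", encoding="utf-8") as f:  # hypothetical file name
    header = f.readline()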
@@ -321,6 +323,10 @@ def main():
help="The output directory where the model predictions and checkpoints will be written.")

## Other parameters
parser.add_argument("--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length",
default=128,
type=int,
@@ -380,9 +386,17 @@ def main():
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")

parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()

if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()

processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
@@ -424,7 +438,8 @@ def main():

if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

task_name = args.task_name.lower()

@@ -447,8 +462,9 @@ def main():
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

# Prepare model
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
model = BertForSequenceClassification.from_pretrained(args.bert_model,
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
cache_dir=cache_dir,
num_labels = num_labels)
if args.fp16:
model.half()
@@ -545,15 +561,21 @@ def main():
optimizer.zero_grad()
global_step += 1

# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)

# Load a trained model that you have fine-tuned
model_state_dict = torch.load(output_model_file)
model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())

# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
model.to(device)

if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
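A condensed sketch of the new restore path introduced above (the directory and num_labels below are hypothetical): the configuration is saved next to the weights under the names the library expects, so the fine-tuned model can be rebuilt without touching args.bert_model again.

import os
import torch
from pytorch_pretrained_bert.modeling import (BertConfig, BertForSequenceClassification,
                                               WEIGHTS_NAME, CONFIG_NAME)

output_dir = "./output"   # hypothetical output directory
num_labels = 2            # hypothetical, task-dependent

config = BertConfig(os.path.join(output_dir, CONFIG_NAME))
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(os.path.join(output_dir, WEIGHTS_NAME)))
model.eval()

This replaces the earlier pattern of re-calling from_pretrained(args.bert_model, state_dict=...), so the saved config now travels with the weights.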
40 changes: 21 additions & 19 deletions examples/run_lm_finetuning.py
@@ -15,22 +15,22 @@
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import logging
import argparse
from tqdm import tqdm, trange
import logging
import os
import random
from io import open

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from torch.utils.data import Dataset
@@ -179,16 +179,16 @@ def get_corpus_line(self, item):
if self.line_buffer is None:
# read first non-empty line of file
while t1 == "" :
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
t1 = next(self.file).strip()
t2 = next(self.file).strip()
else:
# use t2 from previous iteration as new t1
t1 = self.line_buffer
t2 = self.file.__next__().strip()
t2 = next(self.file).strip()
# skip empty rows that are used for separating documents and keep track of current doc id
while t2 == "" or t1 == "":
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
t1 = next(self.file).strip()
t2 = next(self.file).strip()
self.current_doc = self.current_doc+1
self.line_buffer = t2

@@ -222,15 +222,15 @@ def get_random_line(self):
def get_next_line(self):
""" Gets next line of random_file and starts over when reaching end of file"""
try:
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
#keep track of which document we are currently looking at to later avoid having the same doc as t1
if line == "":
self.current_random_doc = self.current_random_doc + 1
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
except StopIteration:
self.random_file.close()
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
return line
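The __next__() → next() substitutions in this file are the Python 2 compatibility fix: Python 2 file iterators expose .next() rather than .__next__(), while the builtin next() works with both. A tiny illustration (the file name is hypothetical):

from io import open  # text-mode files with an encoding on both Python 2 and 3

with open("corpus.txt", "r", encoding="utf-8") as f:
    first = next(f).strip()
    second = next(f).strip()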


@@ -419,6 +419,7 @@ def main():
help="The output directory where the model checkpoints will be written.")

## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--max_seq_length",
default=128,
type=int,
@@ -506,7 +507,8 @@

if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

@@ -575,7 +577,7 @@ def main():
if args.local_rank == -1:
train_sampler = RandomSampler(train_dataset)
else:
#TODO: check if this works with current data generator from disk that relies on file.__next__
#TODO: check if this works with current data generator from disk that relies on next(file)
# (it doesn't return item back by index)
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
@@ -641,4 +643,4 @@ def accuracy(out, labels):


if __name__ == "__main__":
main()
main()
