Merge pull request #254 from huggingface/python_2
Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
thomwolf authored Feb 11, 2019
2 parents 2dfaf2f + 1e71f11 commit 03cdb2a
Showing 30 changed files with 5,728 additions and 298 deletions.
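For orientation, a minimal usage sketch of the models this PR adds. It is illustrative only, not code from the diff: the class names, the 'openai-gpt' shortcut, and the tokenizer methods are assumed to follow the same from_pretrained / tokenize / convert_tokens_to_ids interface the repository already exposes for BERT.

import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

# Download (and cache) the pretrained weights and vocabulary.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

tokens = tokenizer.tokenize("the dog is hairy .")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    lm_logits = model(input_ids)  # language-modeling logits: (batch, seq_len, vocab)

The Transformer-XL classes (TransfoXLTokenizer and TransfoXLLMHeadModel with the 'transfo-xl-wt103' shortcut) are expected to follow the same pattern.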
24 changes: 21 additions & 3 deletions .circleci/config.yml
@@ -1,11 +1,29 @@
version: 2
jobs:
build:
build_py3:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:3.7
- image: circleci/python:3.5
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest
- run: sudo pip install pytest ftfy spacy
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/
build_py2:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:2.7
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest spacy
- run: sudo pip install ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/
workflows:
version: 2
build_and_test:
jobs:
- build_py3
- build_py2
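The extra ftfy and spacy installs (and the `python -m spacy download en` step) supply the optional dependencies of the OpenAI GPT tokenizer added in this PR; ftfy is pinned to 4.4.3 in the Python 2 job, presumably because newer ftfy releases dropped Python 2 support. A small, illustrative availability probe (the fallback behaviour mentioned in the comment is an assumption based on the PR's README, not code from the diff):

def gpt_tokenizer_deps_available():
    """Return True if the optional spacy + ftfy tokenization path can be used."""
    try:
        import ftfy                # repairs mojibake in raw text
        import spacy               # provides the 'en' model downloaded in CI
        spacy.load('en')           # raises OSError if the model is missing
        return True
    except (ImportError, OSError):
        # Assumption: without these packages, the GPT tokenizer falls back
        # to BERT-style basic tokenization.
        return False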
481 changes: 449 additions & 32 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions examples/extract_features.py
@@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
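A minimal sketch of the same convention in code (illustrative only; it mirrors what convert_examples_to_features builds for a sentence pair):

from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens_a = tokenizer.tokenize("is this jacksonville ?")   # -> is this jack ##son ##ville ?
tokens_b = tokenizer.tokenize("no it is not .")

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
type_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
assert len(input_ids) == len(type_ids)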
62 changes: 42 additions & 20 deletions examples/run_classifier.py
@@ -15,26 +15,26 @@
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function

import argparse
import csv
import os
import logging
import argparse
import os
import random
from tqdm import tqdm, trange
import sys

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
@@ -91,10 +91,12 @@ def get_labels(self):
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding='utf-8') as f:
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines

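A side note on the open() change above (illustrative, not from the diff): the encoding= keyword disappears presumably because Python 2's builtin open() does not accept it; a script that needs an explicit encoding on both interpreters can use io.open instead. Here, plain open() plus the per-cell unicode(cell, 'utf-8') conversion is used because Python 2's csv module operates on byte strings.

from io import open  # io.open accepts encoding= on both Python 2 and 3

with open("train.tsv", "r", encoding="utf-8") as f:  # hypothetical file name
    header = f.readline()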
@@ -321,6 +323,10 @@ def main():
help="The output directory where the model predictions and checkpoints will be written.")

## Other parameters
parser.add_argument("--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length",
default=128,
type=int,
@@ -380,9 +386,17 @@ def main():
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")

parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()

if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()

processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
@@ -424,7 +438,8 @@ def main():

if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

task_name = args.task_name.lower()

@@ -447,8 +462,9 @@ def main():
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

# Prepare model
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
model = BertForSequenceClassification.from_pretrained(args.bert_model,
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
cache_dir=cache_dir,
num_labels = num_labels)
if args.fp16:
model.half()
@@ -545,15 +561,21 @@ def main():
optimizer.zero_grad()
global_step += 1

# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)

# Load a trained model that you have fine-tuned
model_state_dict = torch.load(output_model_file)
model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())

# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
model.to(device)

if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
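A condensed sketch of the new restore path introduced above (the directory and num_labels below are hypothetical): the configuration is saved next to the weights under the names the library expects, so the fine-tuned model can be rebuilt without touching args.bert_model again.

import os
import torch
from pytorch_pretrained_bert.modeling import (BertConfig, BertForSequenceClassification,
                                               WEIGHTS_NAME, CONFIG_NAME)

output_dir = "./output"   # hypothetical output directory
num_labels = 2            # hypothetical, task-dependent

config = BertConfig(os.path.join(output_dir, CONFIG_NAME))
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(os.path.join(output_dir, WEIGHTS_NAME)))
model.eval()

This replaces the earlier pattern of re-calling from_pretrained(args.bert_model, state_dict=...), so the saved config now travels with the weights.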
40 changes: 21 additions & 19 deletions examples/run_lm_finetuning.py
@@ -15,22 +15,22 @@
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import logging
import argparse
from tqdm import tqdm, trange
import logging
import os
import random
from io import open

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from torch.utils.data import Dataset
@@ -179,16 +179,16 @@ def get_corpus_line(self, item):
if self.line_buffer is None:
# read first non-empty line of file
while t1 == "" :
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
t1 = next(self.file).strip()
t2 = next(self.file).strip()
else:
# use t2 from previous iteration as new t1
t1 = self.line_buffer
t2 = self.file.__next__().strip()
t2 = next(self.file).strip()
# skip empty rows that are used for separating documents and keep track of current doc id
while t2 == "" or t1 == "":
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
t1 = next(self.file).strip()
t2 = next(self.file).strip()
self.current_doc = self.current_doc+1
self.line_buffer = t2

@@ -222,15 +222,15 @@ def get_random_line(self):
def get_next_line(self):
""" Gets next line of random_file and starts over when reaching end of file"""
try:
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
#keep track of which document we are currently looking at to later avoid having the same doc as t1
if line == "":
self.current_random_doc = self.current_random_doc + 1
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
except StopIteration:
self.random_file.close()
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
return line
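The __next__() → next() substitutions in this file are the Python 2 compatibility fix: Python 2 file iterators expose .next() rather than .__next__(), while the builtin next() works with both. A tiny illustration (the file name is hypothetical):

from io import open  # text-mode files with an encoding on both Python 2 and 3

with open("corpus.txt", "r", encoding="utf-8") as f:
    first = next(f).strip()
    second = next(f).strip()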


@@ -419,6 +419,7 @@ def main():
help="The output directory where the model checkpoints will be written.")

## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--max_seq_length",
default=128,
type=int,
@@ -506,7 +507,8 @@

if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)

tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

@@ -575,7 +577,7 @@ def main():
if args.local_rank == -1:
train_sampler = RandomSampler(train_dataset)
else:
#TODO: check if this works with current data generator from disk that relies on file.__next__
#TODO: check if this works with current data generator from disk that relies on next(file)
# (it doesn't return item back by index)
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
@@ -641,4 +643,4 @@ def accuracy(out, labels):


if __name__ == "__main__":
main()
main()
