## Codestral Mamba

## CodeAct

## Examples
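
The snippet below is a minimal sketch of fine-tuning GPT-2 on a code corpus with the Hugging Face `transformers` Trainer API and then sampling a completion for a simple function stub. It assumes a local plain-text file `code_corpus.txt` containing the training code.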

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Assume code_corpus.txt is your code data file
train_file = "code_corpus.txt"

# Build the training dataset: TextDataset reads the file, tokenizes it,
# and splits it into fixed-length blocks for causal language modeling
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file,
    block_size=128,
)

# Collator for causal LM: mlm=False means plain next-token prediction
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set training parameters
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,  # use more epochs for actual training
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()

# Generate a code completion from a prompt
input_prompt = "def add_numbers(a, b):"
input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)
sample_outputs = model.generate(
    input_ids,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
generated_code = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
print(generated_code)
```
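
Note that `TextDataset` together with `DataCollatorForLanguageModeling(mlm=False)` implements plain causal language modeling. In recent versions of `transformers`, `TextDataset` is deprecated in favor of loading and tokenizing the corpus with the `datasets` library, but the Trainer setup stays the same.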