from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate

# Load the GLUE MRPC paraphrase dataset and tokenize the sentence pairs.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)

# Keep only the columns the model expects and switch to PyTorch tensors.
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# print(tokenized_dataset.column_names["train"])
tokenized_dataset.set_format("torch")
# print(tokenized_dataset)

# Dynamic padding: pad each batch to the length of its longest sequence.
data_collator = DataWithPaddingPlaceholder = DataCollatorWithPadding(tokenizer)

train_dataloader = DataLoader(
    tokenized_dataset["train"],  # was 'validation'; training must use the train split
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"], batch_size=8, collate_fn=data_collator
)

# for batch in train_dataloader:
#     break
# print({k: v.shape for k, v in batch.items()})
# print()
# print(batch)
# print()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)

optimizer = AdamW(model.parameters(), lr=5e-5)
# loss = outputs.loss
# loss.backward()
# optimizer.step()
# optimizer.zero_grad()

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Use the Apple-silicon GPU (MPS) when available, otherwise fall back to CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)
print(f"Using device: {device}")

# Training loop.
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Evaluation on the validation split with the GLUE MRPC metric (accuracy and F1).
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
result = metric.compute()
print(result)

# Save the fine-tuned model and tokenizer.
save_dir = "/Users/alexandr/Desktop/HUGGING_FACE/model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"model and tokenizer saved to {save_dir}")
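
# Optional sketch (not part of the original script): reload the saved model and
# tokenizer from save_dir and run a single paraphrase prediction. The example
# sentence pair below is made up purely for illustration.
reloaded_tokenizer = AutoTokenizer.from_pretrained(save_dir)
reloaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir).to(device)
reloaded_model.eval()
inputs = reloaded_tokenizer(
    "The company said profits rose sharply.",
    "Profits increased significantly, the company said.",
    return_tensors="pt",
).to(device)
with torch.no_grad():
    logits = reloaded_model(**inputs).logits
# Class 1 means "paraphrase" in MRPC, class 0 means "not a paraphrase".
print("predicted class:", torch.argmax(logits, dim=-1).item())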