Causal language modeling (CLM) is the task of predicting the token at the next position in a sentence.
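As a quick illustration before fine-tuning, a text-generation pipeline with the same distilgpt2 checkpoint shows this next-token prediction in action. This is a minimal sketch that is not part of the original walkthrough; the prompt is made up.

from transformers import pipeline

# Hypothetical example: let the pretrained distilgpt2 continue a prompt token by token
generator = pipeline("text-generation", model="distilgpt2")
print(generator("The quick brown fox", max_length=20, num_return_sequences=1))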
from datasets import load_dataset
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
model_checkpoint = "distilgpt2" from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) def tokenize_function(examples): return tokenizer(examples["text"]) tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"]) #进程num_proc=4,如果报错可以删除batched=True, num_proc=4 # 数据分块、拼接 def group_texts(examples): # 拼接所有文本 block_size = 128 #文本分割成block_size大小的块 concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # 我们将余数对应的部分去掉。但如果模型支持的话,可以添加padding total_length = (total_length // block_size) * block_size # 通过max_len进行分割。 result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # 函数调用 lm_datasets = tokenized_datasets.map( group_texts, batched=True, batch_size=1000, num_proc=4, )
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
import math

# Define the model
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Training arguments
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Define the trainer
# Use .select to take a 1000-example subset (a plain Python slice would return a dict, not a Dataset)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"].select(range(1000)),
    eval_dataset=lm_datasets["validation"].select(range(1000)),
)

# Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
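Once training finishes, the fine-tuned model can be tried out directly. The snippet below is an illustrative sketch; the prompt and generation settings are arbitrary choices, not part of the original tutorial.

# Generate a continuation with the fine-tuned causal LM (illustrative only)
inputs = tokenizer("The history of natural language processing", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=50, do_sample=True, top_k=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))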
Masked language modeling (MLM) is the task of recovering the tokens in the input that have been replaced by a "MASK" token.
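The fill-mask pipeline shows what this looks like in practice. This is a minimal sketch using the pretrained distilroberta-base checkpoint (its mask token is <mask>); the example sentence is made up.

from transformers import pipeline

# Hypothetical example: predict the token hidden behind <mask>
unmasker = pipeline("fill-mask", model="distilroberta-base")
print(unmasker("Paris is the <mask> of France."))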
from datasets import load_dataset
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
model_checkpoint = "distilroberta-base" from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"]) # 数据分块、拼接 def group_texts(examples): # 拼接所有文本 block_size = 128 #文本分割成block_size大小的块 concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # 我们将余数对应的部分去掉。但如果模型支持的话,可以添加padding total_length = (total_length // block_size) * block_size # 通过max_len进行分割。 result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # 函数调用 lm_datasets = tokenized_datasets.map( group_texts, batched=True, batch_size=1000, num_proc=4, )
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
import math

# Define the model
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Define the data collator: it masks tokens at random, with mlm_probability as the masking rate
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Training arguments
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Define the trainer
# Use .select to take small subsets (a plain Python slice would return a dict, not a Dataset)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"].select(range(1000)),
    eval_dataset=lm_datasets["validation"].select(range(100)),
    data_collator=data_collator,
)

# Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
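To see what the data collator actually does, one can run it on a couple of examples before training. This is a small sanity check not in the original code; it assumes data_collator and lm_datasets as defined above.

# The collator selects about 15% of tokens; most of them show up as the mask token,
# and the original ids are kept in "labels" (other positions become -100 and are ignored by the loss)
samples = [lm_datasets["train"][i] for i in range(2)]
batch = data_collator(samples)
print((batch["input_ids"] == tokenizer.mask_token_id).float().mean())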
If these packages are not installed, loading the data below may fail.
pip install datasets transformers sacrebleu==1.5.1 sentencepiece
model_checkpoint = "Helsinki-NLP/opus-mt-en-ro"
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("wmt16", "ro-en")
metric = load_metric("sacrebleu")
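The sacrebleu metric takes a list of prediction strings and a list of reference lists (several references per sentence are allowed). A quick made-up example shows the expected input format:

# Toy example (made up) showing the format sacrebleu expects:
# predictions are strings, references are lists of strings (one list per prediction)
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
print(metric.compute(predictions=fake_preds, references=fake_labels))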
For the bilingual language codes (lang) used in translation, refer to the language-code reference site; zh_CN, for example, denotes Chinese.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

if "mbart" in model_checkpoint:
    tokenizer.src_lang = "en-XX"  # source (to-be-translated) language
    tokenizer.tgt_lang = "ro-RO"  # target language

if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "translate English to Romanian: "
else:
    prefix = ""

# Preprocessing function
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "ro"

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the function
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
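It can be worth calling the preprocessing function on a couple of raw examples to inspect the produced input_ids and labels. This is a quick check, assuming raw_datasets as loaded above.

# Inspect the tokenized inputs and labels for the first two training examples
print(preprocess_function(raw_datasets["train"][:2]))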
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Training arguments
batch_size = 16
args = Seq2SeqTrainingArguments(
    "test-translation",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,
)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Post-processing and metric functions for evaluation
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Define the trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Evaluate
# eval_results = trainer.evaluate()
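After training, the fine-tuned model can translate a new sentence. The snippet below is an illustrative sketch; the English sentence and the beam settings are made up.

# Illustrative inference with the fine-tuned en->ro model
sample = tokenizer(prefix + "The weather is nice today.", return_tensors="pt").to(model.device)
translated = model.generate(**sample, max_length=max_target_length, num_beams=4)
print(tokenizer.batch_decode(translated, skip_special_tokens=True))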
If these packages are not installed, loading the data below may fail.
pip install datasets transformers rouge-score nltk
model_checkpoint = "t5-small"
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("xsum")
metric = load_metric("rouge")
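Unlike sacrebleu, the rouge metric takes plain strings for both predictions and references and returns, for each ROUGE variant, low/mid/high aggregates of precision, recall, and F1. A made-up example of the expected format:

# Toy example (made up) showing the format the rouge metric expects
fake_preds = ["hello there", "general kenobi"]
fake_labels = ["hello there", "general kenobi"]
print(metric.compute(predictions=fake_preds, references=fake_labels))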
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

# Preprocessing function
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the function
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Training arguments
batch_size = 16
args = Seq2SeqTrainingArguments(
    "test-summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Post-processing and metric function for evaluation
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Define the trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Evaluate
# eval_results = trainer.evaluate()
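After training, the model can summarize a new document. The snippet below is a rough sketch using the first xsum test document as input; the generation settings are arbitrary choices.

# Illustrative inference with the fine-tuned summarization model
document = prefix + raw_datasets["test"][0]["document"]
inputs = tokenizer(document, max_length=max_input_length, truncation=True, return_tensors="pt").to(model.device)
summary_ids = model.generate(**inputs, max_length=max_target_length, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))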