fine-tuning
初级
未知
未知作者
更新于 2025-06-14
LLM微调与适配技术
微调概述
LLM微调(Fine-tuning)是在预训练模型基础上,使用特定领域或任务的数据进行进一步训练,以提升模型在特定场景下的表现。微调技术包括全参数微调、参数高效微调(PEFT)等方法。
微调数据准备
1. 数据格式化
PYTHON
import json
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import pandas as pd
@dataclass
class TrainingExample:
    """A single supervised fine-tuning sample.

    Attributes:
        instruction: Task description shown to the model.
        input: Optional extra context for the instruction (may be "").
        output: Expected model completion for this sample.
        metadata: Free-form auxiliary information; defaults to None.
    """

    instruction: str
    input: str
    output: str
    # Fix: annotated Optional — the default really is None; callers such as
    # DataProcessor.load_from_json may supply a dict instead.
    metadata: Optional[Dict[str, Any]] = None
class DataProcessor:
    """Loads raw samples and converts them into fine-tuning dataset formats."""

    def __init__(self):
        # Accumulated TrainingExample records, filled by load_from_json().
        self.examples = []

    def load_from_json(self, filepath: str):
        """Load a list of sample dicts from a JSON file into self.examples."""
        with open(filepath, 'r', encoding='utf-8') as f:
            raw_items = json.load(f)
        self.examples.extend(
            TrainingExample(
                instruction=item.get('instruction', ''),
                input=item.get('input', ''),
                output=item.get('output', ''),
                metadata=item.get('metadata', {}),
            )
            for item in raw_items
        )
        print(f"加载了 {len(self.examples)} 个训练样本")

    def create_instruction_dataset(self, examples: List[TrainingExample]) -> List[Dict[str, str]]:
        """Build prompt/completion pairs for instruction tuning."""
        pairs = []
        for ex in examples:
            # The input section is emitted only when the sample has one.
            prompt = (
                f"指令: {ex.instruction}\n输入: {ex.input}\n输出: "
                if ex.input
                else f"指令: {ex.instruction}\n输出: "
            )
            pairs.append({"prompt": prompt, "completion": ex.output})
        return pairs

    def create_chat_dataset(self, examples: List[TrainingExample]) -> List[Dict[str, Any]]:
        """Build chat-style message lists (system/user/assistant) per sample."""
        def to_record(ex: TrainingExample) -> Dict[str, Any]:
            # The user turn concatenates instruction and (optional) input.
            user_content = ex.instruction + (f"\n{ex.input}" if ex.input else "")
            return {
                "messages": [
                    {"role": "system", "content": "你是一个有用的AI助手。"},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": ex.output},
                ]
            }

        return [to_record(ex) for ex in examples]

    def validate_dataset(self, dataset: List[Dict]) -> Dict[str, Any]:
        """Compute quality statistics for a prompt/completion-format dataset.

        Records without both "prompt" and "completion" keys (e.g. chat-format
        records) are counted in the total but skipped for all other stats.
        """
        prompt_lengths: List[int] = []
        completion_lengths: List[int] = []
        empty_count = 0
        long_count = 0
        for record in dataset:
            if "prompt" not in record or "completion" not in record:
                continue
            p_len = len(record["prompt"])
            c_len = len(record["completion"])
            prompt_lengths.append(p_len)
            completion_lengths.append(c_len)
            if not record["prompt"] or not record["completion"]:
                empty_count += 1
            # 4000 characters is the heuristic "too long to train on" cut-off.
            if p_len + c_len > 4000:
                long_count += 1
        stats: Dict[str, Any] = {
            "total_examples": len(dataset),
            "avg_prompt_length": 0,
            "avg_completion_length": 0,
            "empty_examples": empty_count,
            "long_examples": long_count,
        }
        if prompt_lengths:
            stats["avg_prompt_length"] = sum(prompt_lengths) / len(prompt_lengths)
            stats["avg_completion_length"] = sum(completion_lengths) / len(completion_lengths)
        return stats
# Usage example: build in-memory demo samples and exercise both formats.
processor = DataProcessor()

# Create sample data (one sample with an input field, one without)
sample_examples = [
    TrainingExample(
        instruction="将以下文本翻译成英文",
        input="你好,世界!",
        output="Hello, World!"
    ),
    TrainingExample(
        instruction="解释什么是机器学习",
        input="",
        output="机器学习是人工智能的一个分支,它使计算机能够在没有明确编程的情况下学习和改进。"
    )
]

# Create datasets in both supported formats from the same samples
instruction_dataset = processor.create_instruction_dataset(sample_examples)
chat_dataset = processor.create_chat_dataset(sample_examples)
print("指令数据集示例:")
print(json.dumps(instruction_dataset[0], ensure_ascii=False, indent=2))
print("\n对话数据集示例:")
print(json.dumps(chat_dataset[0], ensure_ascii=False, indent=2))

# Validate the dataset (prompt/completion format only)
stats = processor.validate_dataset(instruction_dataset)
print(f"\n数据集统计: {stats}")
2. LoRA微调实现
PYTHON
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
class LoRAFineTuner:
    """Fine-tunes a causal LM with LoRA adapters (via the `peft` library).

    Typical flow: load_model() -> prepare_dataset() -> train() -> generate_text().
    """

    def __init__(self, model_name: str, lora_config: Dict[str, Any] = None):
        """
        Args:
            model_name: Hugging Face hub id or local path of the base model.
            lora_config: Optional kwargs for peft.LoraConfig; the defaults
                below are used when None.
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.peft_model = None
        # Default LoRA configuration; alpha = 2*r is the common convention.
        self.lora_config = lora_config or {
            "r": 16,                                 # LoRA rank
            "lora_alpha": 32,                        # LoRA scaling parameter
            "target_modules": ["q_proj", "v_proj"],  # attention projections to adapt
            "lora_dropout": 0.1,
            "bias": "none",
            "task_type": TaskType.CAUSAL_LM
        }

    def load_model(self):
        """Load tokenizer + base model, then wrap the model with LoRA adapters."""
        print(f"加载模型: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Many causal LMs ship without a pad token; reuse EOS for padding.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        # Apply LoRA adapters on top of the frozen base model.
        lora_config = LoraConfig(**self.lora_config)
        self.peft_model = get_peft_model(self.model, lora_config)
        print(f"LoRA参数数量: {self.peft_model.num_parameters()}")
        print(f"可训练参数数量: {self.peft_model.num_parameters(only_trainable=True)}")

    def prepare_dataset(self, dataset: List[Dict[str, str]], max_length: int = 512):
        """Tokenize prompt+completion pairs into a Hugging Face Dataset.

        Fix: labels now mask padded positions with -100 (the index ignored by
        the transformers cross-entropy loss); the original copied input_ids
        verbatim and therefore trained the model to emit pad tokens.
        """
        def tokenize_function(examples):
            # Concatenate prompt and completion into a single training text.
            texts = [prompt + completion for prompt, completion in
                     zip(examples["prompt"], examples["completion"])]
            tokenized = self.tokenizer(
                texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt"
            )
            labels = tokenized["input_ids"].clone()
            # attention_mask == 0 marks padding; -100 excludes it from the loss.
            labels[tokenized["attention_mask"] == 0] = -100
            tokenized["labels"] = labels
            return tokenized

        from datasets import Dataset
        hf_dataset = Dataset.from_pandas(pd.DataFrame(dataset))
        return hf_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=hf_dataset.column_names
        )

    def train(self, train_dataset, eval_dataset=None, output_dir: str = "./lora_output"):
        """Run the Trainer over the prepared dataset and save the result.

        Raises:
            RuntimeError: if load_model() has not been called yet.
        """
        if self.peft_model is None:
            raise RuntimeError("load_model() must be called before train()")
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            gradient_accumulation_steps=4,  # effective train batch size: 16
            warmup_steps=100,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=10,
            save_steps=500,
            eval_steps=500,
            evaluation_strategy="steps" if eval_dataset else "no",
            save_total_limit=2,
            load_best_model_at_end=bool(eval_dataset),
            report_to=None  # disable wandb/tensorboard reporting
        )
        trainer = Trainer(
            model=self.peft_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )
        print("开始训练...")
        trainer.train()
        trainer.save_model()
        print(f"模型已保存到: {output_dir}")

    def generate_text(self, prompt: str, max_length: int = 200) -> str:
        """Generate a continuation for `prompt`, with the prompt stripped off.

        Raises:
            RuntimeError: if load_model() has not been called yet.
        """
        if self.peft_model is None:
            raise RuntimeError("load_model() must be called before generate_text()")
        inputs = self.tokenizer(prompt, return_tensors="pt")
        # Fix: the tokenizer returns CPU tensors while device_map="auto" may
        # have placed the model on GPU — move inputs to the model's device.
        inputs = {k: v.to(self.peft_model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.peft_model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Drop the echoed prompt so only newly generated text is returned.
        if generated_text.startswith(prompt):
            generated_text = generated_text[len(prompt):].strip()
        return generated_text

    def save_lora_weights(self, save_path: str):
        """Persist only the LoRA adapter weights (small; not the base model)."""
        self.peft_model.save_pretrained(save_path)
        print(f"LoRA权重已保存到: {save_path}")

    def load_lora_weights(self, load_path: str):
        """Attach previously saved LoRA adapter weights to the base model."""
        from peft import PeftModel
        self.peft_model = PeftModel.from_pretrained(self.model, load_path)
        print(f"LoRA权重已从 {load_path} 加载")
# Usage example (requires a GPU environment); kept as a string so the
# tutorial file stays importable without transformers/peft installed.
"""
# 初始化微调器
fine_tuner = LoRAFineTuner("microsoft/DialoGPT-medium")
# 加载模型
fine_tuner.load_model()
# 准备数据集
train_data = processor.create_instruction_dataset(sample_examples)
train_dataset = fine_tuner.prepare_dataset(train_data)
# 开始训练
fine_tuner.train(train_dataset)
# 测试生成
prompt = "指令: 解释什么是深度学习\n输出: "
response = fine_tuner.generate_text(prompt)
print(f"生成的回答: {response}")
"""
3. 模型评估
PYTHON
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score
class ModelEvaluator:
    """Evaluates fine-tuned models on generation and classification tasks."""

    def __init__(self):
        # Corpus-level ROUGE scorer. NOTE(review): use_stemmer targets
        # English morphology; confirm it is intended for Chinese text.
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def evaluate_generation(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Score generated texts against references (ROUGE, BERTScore, lengths).

        Raises:
            ValueError: if either list is empty or their lengths differ
                (fix: the original silently truncated via zip() and produced
                NaN averages on empty input).
        """
        if not predictions or not references:
            raise ValueError("predictions and references must be non-empty")
        if len(predictions) != len(references):
            raise ValueError(
                f"length mismatch: {len(predictions)} predictions vs "
                f"{len(references)} references"
            )
        metrics: Dict[str, float] = {}
        # Per-sample ROUGE F1, averaged over the corpus.
        rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
        for pred, ref in zip(predictions, references):
            scores = self.rouge_scorer.score(ref, pred)
            for metric in rouge_scores:
                rouge_scores[metric].append(scores[metric].fmeasure)
        for metric in rouge_scores:
            metrics[f'{metric}_f1'] = np.mean(rouge_scores[metric])
        # BERTScore with a Chinese model (lang='zh').
        P, R, F1 = bert_score(predictions, references, lang='zh', verbose=False)
        metrics['bert_score_f1'] = F1.mean().item()
        # Whitespace-token length statistics. NOTE(review): .split()
        # under-counts Chinese text, which rarely contains spaces — confirm.
        pred_lengths = [len(pred.split()) for pred in predictions]
        ref_lengths = [len(ref.split()) for ref in references]
        metrics['avg_pred_length'] = np.mean(pred_lengths)
        metrics['avg_ref_length'] = np.mean(ref_lengths)
        mean_ref_length = np.mean(ref_lengths)
        # Fix: guard against all-empty references (avoids divide-by-zero).
        metrics['length_ratio'] = (
            np.mean(pred_lengths) / mean_ref_length if mean_ref_length else 0.0
        )
        return metrics

    def evaluate_classification(self, predictions: List[str], references: List[str],
                                labels: List[str]) -> Dict[str, float]:
        """Map free-text outputs onto `labels` and compute accuracy/F1."""
        pred_labels = self._extract_labels(predictions, labels)
        ref_labels = self._extract_labels(references, labels)
        return {
            'accuracy': accuracy_score(ref_labels, pred_labels),
            'f1_macro': f1_score(ref_labels, pred_labels, average='macro'),
            'f1_micro': f1_score(ref_labels, pred_labels, average='micro')
        }

    def _extract_labels(self, texts: List[str], possible_labels: List[str]) -> List[str]:
        """Return the first label whose lowercase form appears in each text.

        Falls back to the first possible label when nothing matches; note the
        substring test can produce false positives (e.g. "no" inside "north").
        """
        extracted = []
        for text in texts:
            text_lower = text.lower()
            found_label = None
            for label in possible_labels:
                if label.lower() in text_lower:
                    found_label = label
                    break
            extracted.append(found_label or possible_labels[0])
        return extracted

    def comprehensive_evaluation(self, model, test_dataset: List[Dict[str, str]]) -> Dict[str, Any]:
        """Generate predictions with `model` and compute a full metric report.

        `model` must expose generate_text(prompt) -> str (e.g. LoRAFineTuner).
        """
        predictions = []
        references = []
        print("生成预测结果...")
        for i, example in enumerate(test_dataset):
            if i % 10 == 0:
                print(f"处理进度: {i}/{len(test_dataset)}")
            pred = model.generate_text(example["prompt"])
            predictions.append(pred)
            references.append(example["completion"])
        generation_metrics = self.evaluate_generation(predictions, references)
        additional_metrics = {
            'total_examples': len(test_dataset),
            'avg_generation_time': 0.0,  # TODO: measure wall-clock time per sample
            'empty_predictions': sum(1 for pred in predictions if not pred.strip()),
            'repetitive_predictions': self._count_repetitive(predictions)
        }
        return {
            'generation_metrics': generation_metrics,
            'additional_metrics': additional_metrics,
            'sample_predictions': list(zip(predictions[:5], references[:5]))
        }

    def _count_repetitive(self, predictions: List[str]) -> int:
        """Number of predictions that exactly duplicate an earlier one."""
        return len(predictions) - len(set(predictions))
# Usage example: score a few hand-written prediction/reference pairs.
evaluator = ModelEvaluator()

# Simulated evaluation data (predictions paired with references)
sample_predictions = [
    "机器学习是人工智能的一个重要分支。",
    "深度学习使用神经网络进行学习。",
    "自然语言处理处理人类语言。"
]
sample_references = [
    "机器学习是AI的核心技术之一。",
    "深度学习基于多层神经网络。",
    "NLP技术用于理解和生成人类语言。"
]

# Evaluate generation quality and print each metric to 4 decimal places
metrics = evaluator.evaluate_generation(sample_predictions, sample_references)
print("生成质量评估:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")
小结
在本章中,我们学习了:
- 微调基础:全参数微调vs参数高效微调
- 数据准备:训练数据的格式化和验证
- LoRA微调:参数高效的微调方法实现
- 模型评估:多维度的模型性能评估
- 实践技巧:微调过程中的注意事项
微调技术让我们能够将通用LLM适配到特定领域和任务,是构建专业AI应用的重要手段。
思考题
- 如何选择合适的微调策略?
- 在什么情况下应该使用LoRA而不是全参数微调?
- 如何防止微调过程中的过拟合?
- 如何评估微调模型的泛化能力?
下一章我们将学习模型评估与性能优化。