LLM Fine-tuning and Adaptation Techniques

Fine-tuning Overview

LLM fine-tuning takes a pretrained model and continues training it on domain- or task-specific data to improve its performance in a particular scenario. Fine-tuning approaches include full-parameter fine-tuning and parameter-efficient fine-tuning (PEFT), among others.
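
To make the contrast concrete, here is a minimal sketch of the PEFT idea: freeze every pretrained weight and train only a small add-on module. The gpt2 checkpoint and the toy bottleneck adapter are illustrative assumptions, not a recommended setup.

PYTHON
import torch.nn as nn
from transformers import AutoModelForCausalLM

# Any small causal LM works for this illustration
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Full fine-tuning would update every parameter
full_params = sum(p.numel() for p in model.parameters())

# PEFT idea: freeze the base model entirely...
for p in model.parameters():
    p.requires_grad = False

# ...and train only a small extra module
# (a toy bottleneck adapter; gpt2's hidden size is 768)
adapter = nn.Sequential(nn.Linear(768, 16), nn.ReLU(), nn.Linear(16, 768))
adapter_params = sum(p.numel() for p in adapter.parameters())

print(f"Full fine-tuning: {full_params:,} trainable parameters")
print(f"Adapter only:     {adapter_params:,} trainable parameters")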

Preparing Fine-tuning Data

1. Data Formatting

PYTHON
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
 
@dataclass
class TrainingExample:
    """A single training example."""
    instruction: str
    input: str
    output: str
    metadata: Optional[Dict[str, Any]] = None
 
class DataProcessor:
    """Utilities for loading and formatting training data."""
    
    def __init__(self):
        self.examples = []
    
    def load_from_json(self, filepath: str):
        """Load examples from a JSON file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        for item in data:
            example = TrainingExample(
                instruction=item.get('instruction', ''),
                input=item.get('input', ''),
                output=item.get('output', ''),
                metadata=item.get('metadata', {})
            )
            self.examples.append(example)
        
        print(f"Loaded {len(self.examples)} training examples")
    
    def create_instruction_dataset(self, examples: List[TrainingExample]) -> List[Dict[str, str]]:
        """Build an instruction-tuning dataset."""
        dataset = []
        
        for example in examples:
            # Assemble the full prompt
            if example.input:
                prompt = f"Instruction: {example.instruction}\nInput: {example.input}\nOutput: "
            else:
                prompt = f"Instruction: {example.instruction}\nOutput: "
            
            dataset.append({
                "prompt": prompt,
                "completion": example.output
            })
        
        return dataset
    
    def create_chat_dataset(self, examples: List[TrainingExample]) -> List[Dict[str, Any]]:
        """Build a chat-style fine-tuning dataset."""
        dataset = []
        
        for example in examples:
            messages = [
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": example.instruction + (f"\n{example.input}" if example.input else "")},
                {"role": "assistant", "content": example.output}
            ]
            
            dataset.append({"messages": messages})
        
        return dataset
    
    def validate_dataset(self, dataset: List[Dict]) -> Dict[str, Any]:
        """Compute basic quality statistics for a dataset."""
        stats = {
            "total_examples": len(dataset),
            "avg_prompt_length": 0,
            "avg_completion_length": 0,
            "empty_examples": 0,
            "long_examples": 0
        }
        
        prompt_lengths = []
        completion_lengths = []
        
        for item in dataset:
            if "prompt" in item and "completion" in item:
                prompt_len = len(item["prompt"])
                completion_len = len(item["completion"])
                
                prompt_lengths.append(prompt_len)
                completion_lengths.append(completion_len)
                
                if not item["prompt"] or not item["completion"]:
                    stats["empty_examples"] += 1
                
                if prompt_len + completion_len > 4000:  # character count; rough context-length guard
                    stats["long_examples"] += 1
        
        if prompt_lengths:
            stats["avg_prompt_length"] = sum(prompt_lengths) / len(prompt_lengths)
            stats["avg_completion_length"] = sum(completion_lengths) / len(completion_lengths)
        
        return stats
 
# Usage example
processor = DataProcessor()
 
# Build some sample data
sample_examples = [
    TrainingExample(
        instruction="Translate the following text into English",
        input="你好,世界!",
        output="Hello, World!"
    ),
    TrainingExample(
        instruction="Explain what machine learning is",
        input="",
        output="Machine learning is a branch of artificial intelligence that enables computers to learn and improve without being explicitly programmed."
    )
]
 
# Build datasets in both formats
instruction_dataset = processor.create_instruction_dataset(sample_examples)
chat_dataset = processor.create_chat_dataset(sample_examples)
 
print("Instruction dataset example:")
print(json.dumps(instruction_dataset[0], ensure_ascii=False, indent=2))
 
print("\nChat dataset example:")
print(json.dumps(chat_dataset[0], ensure_ascii=False, indent=2))
 
# Validate the dataset
stats = processor.validate_dataset(instruction_dataset)
print(f"\nDataset statistics: {stats}")
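
Many training pipelines expect one JSON object per line (JSONL). Here is a minimal export sketch for the chat dataset built above; the output filename is an illustrative assumption:

PYTHON
# Write the chat dataset as JSONL (hypothetical output path)
with open("train_chat.jsonl", "w", encoding="utf-8") as f:
    for record in chat_dataset:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")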

2. LoRA Fine-tuning Implementation
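
LoRA keeps the pretrained weight matrix frozen and learns a low-rank update. For a layer with weight W of shape d by k, the adapted forward pass is:

LATEX
h = W x + \frac{\alpha}{r} B A x, \qquad B \in \mathbb{R}^{d \times r},\; A \in \mathbb{R}^{r \times k},\; r \ll \min(d, k)

Only A and B are trained; r is the rank and alpha (lora_alpha in the configuration below) scales the update, so the number of trainable parameters grows with r(d + k) instead of dk.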

PYTHON
import torch
import pandas as pd
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
 
class LoRAFineTuner:
    """Wrapper around PEFT LoRA fine-tuning."""
    
    def __init__(self, model_name: str, lora_config: Dict[str, Any] = None):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.peft_model = None
        
        # Default LoRA configuration
        self.lora_config = lora_config or {
            "r": 16,  # LoRA rank
            "lora_alpha": 32,  # LoRA scaling parameter
            "target_modules": ["q_proj", "v_proj"],  # must match the architecture (GPT-2-style models use "c_attn")
            "lora_dropout": 0.1,
            "bias": "none",
            "task_type": TaskType.CAUSAL_LM
        }
    
    def load_model(self):
        """Load the model and tokenizer, then apply LoRA."""
        print(f"Loading model: {self.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        
        # Apply LoRA
        lora_config = LoraConfig(**self.lora_config)
        self.peft_model = get_peft_model(self.model, lora_config)
        
        # Report trainable vs. total parameter counts
        self.peft_model.print_trainable_parameters()
    
    def prepare_dataset(self, dataset: List[Dict[str, str]], max_length: int = 512):
        """Tokenize the dataset for training."""
        def tokenize_function(examples):
            # Concatenate prompt and completion
            texts = [prompt + completion for prompt, completion in 
                    zip(examples["prompt"], examples["completion"])]
            
            # Tokenize
            tokenized = self.tokenizer(
                texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt"
            )
            
            # Labels for the loss: copy input_ids, but ignore padding positions.
            # (Prompt tokens still contribute to the loss here; masking them out
            # as well is a common refinement.)
            labels = tokenized["input_ids"].clone()
            labels[tokenized["attention_mask"] == 0] = -100
            tokenized["labels"] = labels
            
            return tokenized
        
        # Convert to a Hugging Face Dataset
        from datasets import Dataset
        
        df = pd.DataFrame(dataset)
        hf_dataset = Dataset.from_pandas(df)
        
        # Apply tokenization
        tokenized_dataset = hf_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=hf_dataset.column_names
        )
        
        return tokenized_dataset
    
    def train(self, train_dataset, eval_dataset=None, output_dir: str = "./lora_output"):
        """Run training."""
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=100,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=10,
            save_steps=500,
            eval_steps=500,
            evaluation_strategy="steps" if eval_dataset else "no",
            save_total_limit=2,
            load_best_model_at_end=True if eval_dataset else False,
            report_to="none"  # disable wandb and other logging integrations
        )
        
        trainer = Trainer(
            model=self.peft_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )
        
        print("Starting training...")
        trainer.train()
        
        # Save the model
        trainer.save_model()
        print(f"Model saved to: {output_dir}")
    
    def generate_text(self, prompt: str, max_length: int = 200) -> str:
        """Generate text from a prompt."""
        # Move inputs to the same device as the model
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.peft_model.device)
        
        with torch.no_grad():
            outputs = self.peft_model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Strip the original prompt
        if generated_text.startswith(prompt):
            generated_text = generated_text[len(prompt):].strip()
        
        return generated_text
    
    def save_lora_weights(self, save_path: str):
        """Save the LoRA weights."""
        self.peft_model.save_pretrained(save_path)
        print(f"LoRA weights saved to: {save_path}")
    
    def load_lora_weights(self, load_path: str):
        """Load LoRA weights."""
        from peft import PeftModel
        
        self.peft_model = PeftModel.from_pretrained(self.model, load_path)
        print(f"LoRA weights loaded from {load_path}")
 
# Usage example (requires a GPU environment)
"""
# Initialize the fine-tuner (for GPT-2-style models such as DialoGPT,
# pass a lora_config with target_modules=["c_attn"])
fine_tuner = LoRAFineTuner("microsoft/DialoGPT-medium")
 
# Load the model
fine_tuner.load_model()
 
# Prepare the dataset
train_data = processor.create_instruction_dataset(sample_examples)
train_dataset = fine_tuner.prepare_dataset(train_data)
 
# Start training
fine_tuner.train(train_dataset)
 
# Try generation
prompt = "Instruction: Explain what deep learning is\nOutput: "
response = fine_tuner.generate_text(prompt)
print(f"Generated answer: {response}")
"""

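After training, the LoRA update can be folded back into the base weights so that inference needs no peft dependency. A minimal sketch using PeftModel's merge_and_unload; the output path is an illustrative assumption:

PYTHON
"""
# Merge the LoRA weights into the base model for deployment
merged_model = fine_tuner.peft_model.merge_and_unload()
merged_model.save_pretrained("./merged_model")
fine_tuner.tokenizer.save_pretrained("./merged_model")
"""
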
3. Model Evaluation
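
The evaluator below reports F-measures throughout. As a reminder, an F1 score is the harmonic mean of precision P and recall R (ROUGE's fmeasure and BERTScore's F1 both take this form):

LATEX
F_1 = \frac{2 \cdot P \cdot R}{P + R}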

PYTHON
import numpy as np
from typing import List, Dict, Any
from sklearn.metrics import accuracy_score, f1_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score
 
class ModelEvaluator:
    """Multi-metric model evaluator."""
    
    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    def evaluate_generation(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Evaluate text generation quality."""
        metrics = {}
        
        # ROUGE scores
        rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
        
        for pred, ref in zip(predictions, references):
            scores = self.rouge_scorer.score(ref, pred)
            for metric in rouge_scores:
                rouge_scores[metric].append(scores[metric].fmeasure)
        
        for metric in rouge_scores:
            metrics[f'{metric}_f1'] = np.mean(rouge_scores[metric])
        
        # BERTScore (set lang to match the language being evaluated)
        P, R, F1 = bert_score(predictions, references, lang='en', verbose=False)
        metrics['bert_score_f1'] = F1.mean().item()
        
        # Length statistics
        pred_lengths = [len(pred.split()) for pred in predictions]
        ref_lengths = [len(ref.split()) for ref in references]
        
        metrics['avg_pred_length'] = np.mean(pred_lengths)
        metrics['avg_ref_length'] = np.mean(ref_lengths)
        metrics['length_ratio'] = np.mean(pred_lengths) / np.mean(ref_lengths)
        
        return metrics
    
    def evaluate_classification(self, predictions: List[str], references: List[str], 
                              labels: List[str]) -> Dict[str, float]:
        """Evaluate a classification task."""
        # Map free-text predictions onto the label set
        pred_labels = self._extract_labels(predictions, labels)
        ref_labels = self._extract_labels(references, labels)
        
        metrics = {
            'accuracy': accuracy_score(ref_labels, pred_labels),
            'f1_macro': f1_score(ref_labels, pred_labels, average='macro'),
            'f1_micro': f1_score(ref_labels, pred_labels, average='micro')
        }
        
        return metrics
    
    def _extract_labels(self, texts: List[str], possible_labels: List[str]) -> List[str]:
        """Extract a label from each text."""
        extracted = []
        
        for text in texts:
            text_lower = text.lower()
            found_label = None
            
            for label in possible_labels:
                if label.lower() in text_lower:
                    found_label = label
                    break
            
            extracted.append(found_label or possible_labels[0])  # fall back to the first label
        
        return extracted
    
    def comprehensive_evaluation(self, model, test_dataset: List[Dict[str, str]]) -> Dict[str, Any]:
        """Run a comprehensive evaluation."""
        predictions = []
        references = []
        
        print("Generating predictions...")
        for i, example in enumerate(test_dataset):
            if i % 10 == 0:
                print(f"Progress: {i}/{len(test_dataset)}")
            
            # Generate a prediction
            pred = model.generate_text(example["prompt"])
            predictions.append(pred)
            references.append(example["completion"])
        
        # Compute generation metrics
        generation_metrics = self.evaluate_generation(predictions, references)
        
        # Compute additional metrics
        additional_metrics = {
            'total_examples': len(test_dataset),
            'avg_generation_time': 0.0,  # measure this in real usage
            'empty_predictions': sum(1 for pred in predictions if not pred.strip()),
            'repetitive_predictions': self._count_repetitive(predictions)
        }
        
        return {
            'generation_metrics': generation_metrics,
            'additional_metrics': additional_metrics,
            'sample_predictions': list(zip(predictions[:5], references[:5]))
        }
    
    def _count_repetitive(self, predictions: List[str]) -> int:
        """Count duplicate predictions."""
        unique_predictions = set(predictions)
        return len(predictions) - len(unique_predictions)
 
# Usage example
evaluator = ModelEvaluator()
 
# Mock evaluation data
sample_predictions = [
    "Machine learning is an important branch of artificial intelligence.",
    "Deep learning uses neural networks to learn.",
    "Natural language processing deals with human language."
]
 
sample_references = [
    "Machine learning is one of the core technologies of AI.",
    "Deep learning is based on multi-layer neural networks.",
    "NLP techniques are used to understand and generate human language."
]
 
# Evaluate generation quality
metrics = evaluator.evaluate_generation(sample_predictions, sample_references)
print("Generation quality evaluation:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")
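
The evaluate_classification helper can be exercised the same way; the sentiment labels and texts below are made-up examples:

PYTHON
# Hypothetical classification outputs and gold labels
cls_predictions = ["The review is clearly positive.", "This one reads as negative."]
cls_references = ["positive", "negative"]

cls_metrics = evaluator.evaluate_classification(
    cls_predictions, cls_references, labels=["positive", "negative"]
)
print("Classification evaluation:")
for metric, value in cls_metrics.items():
    print(f"{metric}: {value:.4f}")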

Summary

In this chapter, we covered:

  1. Fine-tuning basics: full-parameter fine-tuning vs. parameter-efficient fine-tuning
  2. Data preparation: formatting and validating training data
  3. LoRA fine-tuning: implementing a parameter-efficient fine-tuning method
  4. Model evaluation: assessing model performance along multiple dimensions
  5. Practical tips: pitfalls to watch for during fine-tuning

Fine-tuning lets us adapt a general-purpose LLM to specific domains and tasks, making it a key tool for building specialized AI applications.

Questions to Consider

  1. How do you choose an appropriate fine-tuning strategy?
  2. When should you use LoRA rather than full-parameter fine-tuning?
  3. How can you prevent overfitting during fine-tuning?
  4. How do you evaluate the generalization ability of a fine-tuned model?

In the next chapter, we will look at model evaluation and performance optimization.