我们项目中的大部分训练都是一个持续的过程,如果每次都从头开始训练,那将是一项繁重的任务。比如我的数据来源是持续产生的,最优的解决方案就是填鸭式的增量训练,而不是每次都从头开始。官方文档中的示例掐头去尾,验证过程中总是不能顺利通过;皇天不负有心人,经过认真推敲、翻阅网络资料和数次实践,终于顺利完成。接下来我们用实际行动来验证一番,如有错误还烦请各位大神指出。
1. 使用 VS 2022(ML.NET Model Builder)自动选择最优算法并生成训练基础代码,我们在这份基础代码上进行修改,以满足重新训练的需要。
2. 使用 VS 2022 生成的代码如下。因为本文只关注重新训练,其他代码先忽略。
// This file was auto-generated by ML.NET Model Builder.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML;
namespace MLRetraining
{
    public partial class SmsClassification
    {
        /// <summary>
        /// Builds the Model Builder pipeline and fits it on the supplied data.
        /// For more information on how to load data, see aka.ms/loaddata.
        /// </summary>
        /// <param name="mlContext">ML.NET context used to create the pipeline.</param>
        /// <param name="trainData">Data the pipeline is fitted on.</param>
        /// <returns>The fitted transformer.</returns>
        public static ITransformer RetrainPipeline(MLContext mlContext, IDataView trainData)
        {
            return BuildPipeline(mlContext).Fit(trainData);
        }

        /// <summary>
        /// Builds the estimator chain generated by Model Builder: text featurization of
        /// <c>col0</c>, key mapping of <c>col1</c>, min-max normalization of <c>Features</c>,
        /// the L-BFGS maximum entropy trainer, and mapping of <c>PredictedLabel</c> back to its value.
        /// </summary>
        /// <param name="mlContext">ML.NET context used to create the estimators.</param>
        /// <returns>The unfitted estimator chain.</returns>
        public static IEstimator<ITransformer> BuildPipeline(MLContext mlContext)
        {
            // Trainer settings, kept exactly as generated.
            var trainerOptions = new LbfgsMaximumEntropyMulticlassTrainer.Options()
            {
                L1Regularization = 1F,
                L2Regularization = 1F,
                LabelColumnName = @"col1",
                FeatureColumnName = @"Features"
            };

            // Data process configuration with pipeline data transformations
            var estimatorChain = mlContext.Transforms.Text.FeaturizeText(inputColumnName: @"col0", outputColumnName: @"col0")
                .Append(mlContext.Transforms.Concatenate(@"Features", new[] { @"col0" }))
                .Append(mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: @"col1", inputColumnName: @"col1"))
                .Append(mlContext.Transforms.NormalizeMinMax(@"Features", @"Features"))
                .Append(mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(trainerOptions))
                .Append(mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: @"PredictedLabel", inputColumnName: @"PredictedLabel"));

            return estimatorChain;
        }
    }
}
3. 将以上代码修改为单独的数据准备管道和单独的训练模型
//首次训练
/// <summary>
/// Performs the initial training run. The data preparation pipeline and the
/// L-BFGS maximum entropy trainer are fitted separately and saved to
/// "data_preparation_pipeline.zip" and "model.zip" respectively.
/// </summary>
public static void FirstTranin()
{
    MLContext mlContext = new MLContext();

    // Data preparation only. The trainer and the MapKeyToValue step from the generated
    // pipeline are deliberately left out of this chain: the trainer is fitted and saved
    // on its own below.
    var dataPrepEstimator = mlContext.Transforms.Text.FeaturizeText(inputColumnName: @"col0", outputColumnName: @"col0")
        .Append(mlContext.Transforms.Concatenate(@"Features", new[] { @"col0" }))
        .Append(mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: @"col1", inputColumnName: @"col1"))
        .Append(mlContext.Transforms.NormalizeMinMax(@"Features", @"Features"));

    // Training data is a '|'-separated text file in the current directory.
    string _dataPath = Path.Combine(Environment.CurrentDirectory, "原始文本confirm.txt");
    var data = mlContext.Data.LoadFromTextFile<SmsClassification.ModelInput>(path: _dataPath, separatorChar: '|');

    // Fit the preparation pipeline and produce the featurized view the trainer consumes.
    ITransformer dataPrepTransformer = dataPrepEstimator.Fit(data);
    IDataView transformedData = dataPrepTransformer.Transform(data);

    var lbfgsEstimator = mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(new LbfgsMaximumEntropyMulticlassTrainer.Options() { L1Regularization = 1F, L2Regularization = 1F, LabelColumnName = @"col1", FeatureColumnName = @"Features" });

    // NOTE: an earlier version called lbfgsEstimator.Append(MapKeyToValue("PredictedLabel"))
    // here and discarded the result, which had no effect. It is intentionally not chained:
    // ReTranin casts the saved model to ISingleFeaturePredictionTransformer<object>, so the
    // saved model must be the bare prediction transformer.
    var trainedModel = lbfgsEstimator.Fit(transformedData);

    // Save Data Prep transformer
    mlContext.Model.Save(dataPrepTransformer, data.Schema, "data_preparation_pipeline.zip");
    // Save Trained Model
    mlContext.Model.Save(trainedModel, transformedData.Schema, "model.zip");
}
4. 重新训练(用新数据在原有模型的基础上继续训练)
/// <summary>
/// Continues training the saved model with new data. Loads the saved data preparation
/// pipeline and trained model, fits the L-BFGS maximum entropy trainer on the new data
/// starting from the saved model parameters, and overwrites "model.zip" with the result.
/// </summary>
/// <param name="data">New training examples to add to the existing model.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The model stored in "model.zip" does not carry <see cref="MaximumEntropyModelParameters"/>.
/// </exception>
public static void ReTranin(ModelInput[] data)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    // Create MLContext
    MLContext mlContext = new MLContext();

    // Load data preparation pipeline and trained model (the input schemas are not needed here)
    ITransformer dataPrepPipeline = mlContext.Model.Load("data_preparation_pipeline.zip", out _);
    ITransformer trainedModel = mlContext.Model.Load("model.zip", out _);

    IDataView newData = mlContext.Data.LoadFromEnumerable<ModelInput>(data);

    // Preprocess Data
    IDataView transformedNewData = dataPrepPipeline.Transform(newData);

    // Extract the previously learned parameters that seed the new training run.
    var originalModelParameters =
        ((ISingleFeaturePredictionTransformer<object>)trainedModel).Model as MaximumEntropyModelParameters;
    if (originalModelParameters == null)
    {
        // Without this check a null would be handed to Fit below; presumably the trainer
        // would then start from scratch and the previously learned model would be
        // overwritten - fail loudly instead.
        throw new InvalidOperationException(
            "The model stored in 'model.zip' does not contain MaximumEntropyModelParameters and cannot be retrained.");
    }

    // Same trainer options as the initial training, warm-started from the saved parameters.
    MulticlassPredictionTransformer<MaximumEntropyModelParameters> retrainedModel =
        mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(new LbfgsMaximumEntropyMulticlassTrainer.Options() { L1Regularization = 1F, L2Regularization = 1F, LabelColumnName = @"col1", FeatureColumnName = @"Features" })
            .Fit(transformedNewData, originalModelParameters);

    // The data preparation pipeline is reused as loaded, so only the trained model is saved again.
    mlContext.Model.Save(retrainedModel, transformedNewData.Schema, "model.zip");
}
5. 使用模型进行预测
/// <summary>
/// Predicts the output for a single input by chaining the saved data preparation
/// pipeline ("data_preparation_pipeline.zip") with the saved trained model ("model.zip").
/// </summary>
/// <param name="input">The example to classify.</param>
/// <returns>The prediction produced by the combined pipeline.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
public static ModelOutput Predict(ModelInput input)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }

    MLContext mlContext = new MLContext();

    // Load data preparation pipeline and trained model (the input schemas are not needed here)
    ITransformer dataPrepPipeline = mlContext.Model.Load("data_preparation_pipeline.zip", out _);
    ITransformer trainedModel = mlContext.Model.Load("model.zip", out _);

    // Raw input must go through data preparation before it reaches the trained model.
    ITransformer predictionPipeline = dataPrepPipeline.Append(trainedModel);

    // The prediction engine is disposable; release it once the single prediction is done.
    using (var predictor = mlContext.Model.CreatePredictionEngine<ModelInput, ModelOutput>(predictionPipeline))
    {
        return predictor.Predict(input);
    }
}