# Source code for czsc.ai.sk

# -*- coding: utf-8 -*-
"""
author: zengbin93
email: zeng_bin8888@163.com
create_dt: 2022/3/28 17:48
describe: 基于 Scikit-Learn, Xgboost, LightGBM 三个机器学习库的分类、回归模型
"""
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from typing import List, AnyStr

from .utils import get_datetime_spans


def evaluate_estimator(estimator: Pipeline, df: pd.DataFrame, x_cols: List[AnyStr], y_col: AnyStr = None):
    """Predict with a fitted pipeline and, when labels are given, print an evaluation report.

    Predictions are written into ``df`` in place as the ``y_pred`` column, and
    the same ``df`` object is returned.

    :param estimator: fitted model assembled as a Pipeline; the name of its last
        step ('regressor' or 'classifier', compared case-insensitively) selects
        which report is printed
    :param df: input data; must contain the ``x_cols`` columns and, when ``y_col``
        is given, a ``dt`` column whose min/max values support ``strftime``
    :param x_cols: feature column names
    :param y_col: ground-truth column name; when empty or None, no report is
        printed and only the predictions are added
    :return: ``df`` with the ``y_pred`` column added
    :raises ValueError: if ``y_col`` is given and the name of the last pipeline
        step is neither 'regressor' nor 'classifier'
    """
    y_pred = estimator.predict(df[x_cols])
    df['y_pred'] = y_pred
    if not y_col:
        return df

    # The report type is chosen from the *name* of the final pipeline step.
    model_type = estimator.steps[-1][0]
    sdt = df['dt'].min().strftime("%Y%m%d")
    edt = df['dt'].max().strftime("%Y%m%d")
    y_true = df[y_col]
    kind = model_type.lower()

    if kind == 'regressor':
        r2 = metrics.r2_score(y_true, y_pred)
        mse = metrics.mean_squared_error(y_true, y_pred)
        mae = metrics.mean_absolute_error(y_true, y_pred)
        print(f"{sdt} - {edt} 回归模型评估: \nr2_score = {r2:.3f}; MSE = {mse:.3f}; MAE = {mae:.3f}")
    elif kind == 'classifier':
        print(f"{sdt} - {edt} 分类模型评估: \n{metrics.classification_report(y_true, y_pred)}")
    else:
        raise ValueError(
            f"unsupported final pipeline step name {model_type!r}; "
            "expected 'regressor' or 'classifier'"
        )
    return df
def train_estimator(estimator: Pipeline, df: pd.DataFrame, x_cols: List[AnyStr], y_col: AnyStr,
                    train_days: int, valid_days: int, method="rolling"):
    """Fit and validate a model over successive time windows.

    The time windows come from ``get_datetime_spans``. For each window the
    estimator is refit on the training slice, used to predict the validation
    slice, and evaluated on both slices with ``evaluate_estimator``. Windows
    whose validation slice is empty are skipped.

    :param estimator: model assembled as a Pipeline; it is refit in every window
    :param df: data containing a ``dt`` column plus the ``x_cols`` and ``y_col`` columns
    :param x_cols: feature column names
    :param y_col: ground-truth column name
    :param train_days: time span of the training set
    :param valid_days: time span of the validation set
    :param method: window rolling method, 'rolling' for a sliding window or
        'expanding' for an expanding window; passed through to ``get_datetime_spans``
    :return: tuple ``(dfr, estimator)`` where ``dfr`` is the concatenation of all
        validation slices with their ``y_pred`` column, and ``estimator`` is the
        pipeline as fitted on the last window that had validation data
    :raises ValueError: if no window produced any validation rows
    """
    sdt = df['dt'].min()
    edt = df['dt'].max()
    spans = get_datetime_spans(sdt, edt, train_days, valid_days, method)

    predicts = []
    for train_sdt, train_edt, valid_sdt, valid_edt in spans:
        train = df[(df['dt'] >= train_sdt) & (df['dt'] <= train_edt)].copy()
        # Original author's note: as the window rolls, train_edt == valid_sdt, so the
        # validation slice must use a strict '>' on valid_sdt to avoid overlapping
        # the training slice.
        valid = df[(df['dt'] > valid_sdt) & (df['dt'] <= valid_edt)].copy()
        if valid.empty:
            continue

        print("=" * 100)
        print(f"测试区间:{valid_sdt} ~ {valid_edt}\n")
        print(f'样本比例 = 训练集:测试集 = {len(train)}: {len(valid)} \n')

        estimator.fit(train[x_cols], train[y_col])
        valid['y_pred'] = estimator.predict(valid[x_cols])
        predicts.append(valid)

        evaluate_estimator(estimator, train, x_cols, y_col)
        evaluate_estimator(estimator, valid, x_cols, y_col)

    if not predicts:
        # pd.concat([]) would raise an opaque "No objects to concatenate" ValueError;
        # raise the same exception type with a message that explains the cause.
        raise ValueError(
            "no validation samples were produced: every time window had an empty "
            "validation set (check df['dt'], train_days and valid_days)"
        )

    dfr = pd.concat(predicts)
    return dfr, estimator