Quant Trading: It's Really No Big Deal

import os.path
import pandas as pd
from tqdm import tqdm
import glob
from paddlets.datasets import TSDataset
from paddlets.transform import StandardScaler
from paddlets.models.forecasting import MLPRegressor, NHiTSModel, RNNBlockRegressor
from paddlets.ensemble import WeightingEnsembleForecaster
import numpy as np
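
# Pipeline overview:
#   1. Load per-stock daily CSVs and engineer technical-indicator features
#   2. Fit a StandardScaler on the training window and apply it to both sets
#   3. Load a pre-trained weighting ensemble (NHiTS + RNN + MLP)
#   4. Predict the next 3 bars for each stock in parallel with Ray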

# Output folders for the results and the pre-trained model
result_center = "forecasting_all_result_center"
model_center = "model_forecasting_center_2048_a_b"
# Collect the paths of all per-stock daily CSV files (Tushare daily bars)
csv_paths = glob.glob(os.path.join("./tu_share_data_day", "*.csv"))
sum_dam_data = []

# Clean NaN and inf values in the features: convert infs to NaN first, then
# forward-fill and zero-fill whatever remains at the head of each series
def handle_nan_and_inf(data):
    data = data.replace([np.inf, -np.inf], np.nan)
    data = data.ffill().fillna(0)
    return data

# Compute technical-indicator features from the raw OHLCV columns
def calculate_features(data):
    # Simple moving averages of the close over 5/10/20 days
    data['MA5'] = data['close'].rolling(window=5).mean()
    data['MA10'] = data['close'].rolling(window=10).mean()
    data['MA20'] = data['close'].rolling(window=20).mean()
    
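    # Exponential moving averages (the 12/26-day pair used by MACD)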
    data['EMA12'] = data['close'].ewm(span=12, adjust=False).mean()
    data['EMA26'] = data['close'].ewm(span=26, adjust=False).mean()
    
    data['Volatility_5'] = data['close'].rolling(window=5).std()
    data['Volatility_10'] = data['close'].rolling(window=10).std()
    
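    # Volume features; pct_change yields inf when the previous day's volume
    # is 0, which handle_nan_and_inf cleans up at the end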
    data['Volume_MA5'] = data['vol'].rolling(window=5).mean()
    data['Volume_Change_Rate'] = data['vol'].pct_change()
    
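    # 14-day RSI: RS = average gain / average loss, RSI = 100 - 100 / (1 + RS)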
    delta = data['close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    average_gain = gain.rolling(window=14).mean()
    average_loss = loss.rolling(window=14).mean()
    rs = average_gain / average_loss
    data['RSI14'] = 100 - (100 / (1 + rs))
    
    data['Momentum_3'] = data['close'].diff(3)
    data['Momentum_7'] = data['close'].diff(7)
    
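    # Bollinger Bands: 20-day SMA plus/minus two rolling standard deviations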
    data['Middle_Band'] = data['close'].rolling(window=20).mean()
    data['Upper_Band'] = data['Middle_Band'] + 2 * data['close'].rolling(window=20).std()
    data['Lower_Band'] = data['Middle_Band'] - 2 * data['close'].rolling(window=20).std()
    
    data = handle_nan_and_inf(data)
    return data

# Hyperparameters for the three base forecasters; nothing is trained in this
# script, the pre-trained ensemble is loaded from disk below
nhits_params = {
    'sampling_stride': 8,
    'eval_metrics': ["mse", "mae"],
    'batch_size': 32,
    'max_epochs': 100,
    'patience': 10
}
rnn_params = {
    'sampling_stride': 8,
    'eval_metrics': ["mse", "mae"],
    'batch_size': 32,
    'max_epochs': 100,
    'patience': 10,
}
mlp_params = {
    'sampling_stride': 8,
    'eval_metrics': ["mse", "mae"],
    'batch_size': 32,
    'max_epochs': 100,
    'patience': 10,
    'use_bn': True,
}

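# Three heterogeneous forecasters combined by a weighting ensemble: each base
# model sees a 64-bar lookback (in_chunk_len) and forecasts one bar ahead
# (out_chunk_len); the saved weights are loaded right below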
reg = WeightingEnsembleForecaster(
    in_chunk_len=64,
    out_chunk_len=1,
    skip_chunk_len=0,
    estimators=[(NHiTSModel, nhits_params), (RNNBlockRegressor, rnn_params), (MLPRegressor, mlp_params)]
)

reg = reg.load(os.path.join(model_center, "low_high"))

# Build the training frame: loop over every CSV file
for csv_path in tqdm(csv_paths):
    new_data = pd.read_csv(csv_path)
    
    # Coerce the price/volume columns to numeric (unparseable values become NaN)
    new_data[['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount']] = \
        new_data[['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount']].apply(pd.to_numeric, errors='coerce')
    
    if len(new_data) < 4096:
        continue

    new_data = new_data.iloc[::-1]  # reverse rows so time runs oldest to newest
    new_data = new_data[-4096:-128]  # hold out the final 128 bars for validation

    # Build a fresh integer time index (1..N) for TSDataset
    new_data['index_new'] = range(1, len(new_data) + 1)

    # Compute the technical-indicator features
    new_data = calculate_features(new_data)

    sum_dam_data.append(new_data)

# Concatenate all stocks into a single frame
dam_data = pd.concat(sum_dam_data)
dam_data.reset_index(drop=True, inplace=True)

# Build the training TSDataset (group_id splits it into one series per ts_code)
dataset = TSDataset.load_from_dataframe(
    dam_data,
    group_id='ts_code',
    time_col="index_new",
    target_cols=['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'MA5', 'MA10', 'MA20', 
                 'EMA12', 'EMA26', 'Volatility_5', 'Volatility_10', 'Volume_MA5', 'Volume_Change_Rate', 
                 'RSI14', 'Momentum_3', 'Momentum_7', 'Middle_Band', 'Upper_Band', 'Lower_Band']
)

# Validation data: the most recent 128 bars of each stock
sum_dam_data = []
for csv_path in tqdm(csv_paths):
    new_data = pd.read_csv(csv_path)
    
    # Coerce the price/volume columns to numeric
    new_data[['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount']] = \
        new_data[['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount']].apply(pd.to_numeric, errors='coerce')
    
    if len(new_data) < 2048:  # looser cutoff than the training loop's 4096
        continue

    new_data = new_data.iloc[::-1]  # reverse rows so time runs oldest to newest
    new_data = new_data[-128:]      # keep only the most recent 128 bars

    new_data['index_new'] = range(1, len(new_data) + 1)

    # Compute the technical-indicator features
    new_data = calculate_features(new_data)

    sum_dam_data.append(new_data)

# Concatenate all validation slices
valid_dam_data = pd.concat(sum_dam_data)
valid_dam_data.reset_index(drop=True, inplace=True)

# Build the validation TSDataset
valid_tsdataset = TSDataset.load_from_dataframe(
    valid_dam_data,
    group_id='ts_code',
    time_col="index_new",
    target_cols=['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'MA5', 'MA10', 'MA20', 
                 'EMA12', 'EMA26', 'Volatility_5', 'Volatility_10', 'Volume_MA5', 'Volume_Change_Rate', 
                 'RSI14', 'Momentum_3', 'Momentum_7', 'Middle_Band', 'Upper_Band', 'Lower_Band']
)

# Standardization: fit the scaler on the training set only, then transform
# both sets (fitting on validation data would leak future information)
scaler = StandardScaler()
scaler = scaler.fit(dataset)
dataset = scaler.transform(dataset)
valid_tsdataset = scaler.transform(valid_tsdataset)
import ray

# Parallel per-stock prediction with Ray
@ray.remote
def csv_predict(csv_path):
    # Rebuild the ensemble inside the worker and load the saved weights;
    # Ray workers do not share the driver's model object
    nhits_params = {
        'sampling_stride': 8,
        'eval_metrics': ["mse", "mae"],
        'batch_size': 32,
        'max_epochs': 100,
        'patience': 10
    }
    rnn_params = {
        'sampling_stride': 8,
        'eval_metrics': ["mse", "mae"],
        'batch_size': 32,
        'max_epochs': 100,
        'patience': 10,
    }
    mlp_params = {
        'sampling_stride': 8,
        'eval_metrics': ["mse", "mae"],
        'batch_size': 32,
        'max_epochs': 100,
        'patience': 10,
        'use_bn': True,
    }

    reg = WeightingEnsembleForecaster(
        in_chunk_len=64,
        out_chunk_len=1,
        skip_chunk_len=0,
        estimators=[(NHiTSModel, nhits_params), (RNNBlockRegressor, rnn_params), (MLPRegressor, mlp_params)]
    )

    reg = reg.load(os.path.join(model_center, "low_high"))
    new_data = pd.read_csv(csv_path)
    
    # Coerce the price/volume columns to numeric
    new_data[['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount']] = \
        new_data[['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount']].apply(pd.to_numeric, errors='coerce')
    
    if len(new_data) < 2048:
        return {}

    new_data = new_data.iloc[::-1]  # reverse rows so time runs oldest to newest
    new_data = new_data[-128:]      # keep only the most recent 128 bars

    new_data['index_new'] = range(1, len(new_data) + 1)

    # Compute the technical-indicator features
    new_data = calculate_features(new_data)

    # Build a single-stock TSDataset for this file
    valid_tsdataset = TSDataset.load_from_dataframe(
        new_data,
        time_col="index_new",
        target_cols=['open', 'high', 'low', 'close', 'pre_close', 'change', 'pct_chg', 'MA5', 'MA10', 'MA20', 
                    'EMA12', 'EMA26', 'Volatility_5', 'Volatility_10', 'Volume_MA5', 'Volume_Change_Rate', 
                    'RSI14', 'Momentum_3', 'Momentum_7', 'Middle_Band', 'Upper_Band', 'Lower_Band']
    )
    valid_tsdataset = scaler.transform(valid_tsdataset)
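    # recursive_predict chains one-step forecasts, feeding each prediction
    # back as input until 3 future bars have been produced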
    predicted = reg.recursive_predict(valid_tsdataset, 3)
    predicted = scaler.inverse_transform(predicted)
    predicted = predicted.to_dataframe()
    high_value = predicted.max().to_dict()['high']
    low_value = predicted.min().to_dict()['low']
    # Predicted low-to-high swing, expressed in parts per thousand
    round_value = round((high_value - low_value) / low_value, 3) * 1000
    # The prediction index continues index_new, so subtracting len(new_data)
    # gives how many bars ahead each extreme is expected
    high_index = predicted[predicted['high'] == high_value].index.values[0] - len(new_data)
    low_index = predicted[predicted['low'] == low_value].index.values[0] - len(new_data)
    json_path = csv_path.replace("csv", "json")  # computed but never used
    return {
        "round_value": round_value,
        "stock_name": os.path.split(csv_path)[-1],
        "low_index": low_index,
        "low_value": round(low_value, 2),
        "high_index": high_index,
        "high_value": round(high_value, 2),
    }

# Shut down any stale Ray session (safe even if none is running), then start
# a fresh one and submit one prediction task per CSV file
ray.shutdown()
ray.init()

futures = []
for csv_path in tqdm(csv_paths):
    futures.append(csv_predict.remote(csv_path))
result = ray.get(futures)
ray.shutdown()
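
Each worker returns a summary dict per stock, but nothing is persisted (json_path is computed and then dropped). Here is a minimal sketch of saving the gathered results, assuming a single summary file is acceptable; the filename forecast_summary.json is my own choice:

import json

# Drop the empty dicts returned for stocks with too little history and
# rank by the predicted per-mille swing before writing one summary file.
# default=float coerces any numpy scalars that json cannot serialize.
results = [r for r in result if r]
results.sort(key=lambda r: r["round_value"], reverse=True)
with open("forecast_summary.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2, default=float)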

# Optional sequential check: single-step predict on each validation series
for valid_tsdata in valid_tsdataset:
    try:
        predicted = reg.predict(valid_tsdata)
        print(predicted)
    except Exception as e:
        print(e)
        continue