【特征工程】

编程入门行业动态更新时间:2024-10-10 06:13:51

【<a href="https://www.elefans.com/category/jswz/34/1769701.html">特征工程</a>】

【特征工程】

机器学习时间特征处理方法汇总

时间特征基本处理
基本处理基础上二次处理
差分、滞后、滑窗、指数加权
参考文献：

时间特征基本处理

# Basic transformations of a datetime column
class HandleDateFeature:
    """Add basic calendar features derived from one datetime column.

    Every method writes new columns into the DataFrame given to the
    constructor (in place) and returns that same DataFrame.  New column
    names are ``<col>`` followed by a fixed suffix such as ``_year``.

    Args:
        df: DataFrame holding the datetime column; it is mutated in place.
        col: Name of the column in ``df``; it must support the ``.dt``
            accessor (i.e. be datetime-like).
    """

    def __init__(self, df, col):
        self.df = df
        self.col = col

    def datetime_transfrom(self):
        """Add year/quarter/month/week/day/hour/minute/second columns.

        Returns:
            The DataFrame passed to the constructor, with the new columns.
        """
        frame = self.df
        prefix = self.col
        stamps = frame[prefix].dt

        frame[prefix + '_year'] = stamps.year
        frame[prefix + '_quarter'] = stamps.quarter
        frame[prefix + '_month'] = stamps.month

        # ``Series.dt.weekofyear`` was removed in pandas 2.0; the ISO calendar
        # week is its replacement.  ``isocalendar()`` yields a nullable UInt32
        # column, so cast back to the plain dtypes the old accessor produced
        # (int64, or float64 when missing timestamps are present).
        iso_week = stamps.isocalendar().week
        legacy_dtype = 'float64' if iso_week.isna().any() else 'int64'
        frame[prefix + '_week'] = iso_week.astype(legacy_dtype)

        # NOTE: ``_day`` is the day of the *year*; ``_month_day`` is the day
        # of the month.
        frame[prefix + '_day'] = stamps.dayofyear
        frame[prefix + '_month_day'] = stamps.day
        frame[prefix + '_dayofweek'] = stamps.dayofweek
        frame[prefix + '_hour'] = stamps.hour
        frame[prefix + '_minute'] = stamps.minute
        frame[prefix + '_second'] = stamps.second
        return frame

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers continue to work.
    datetime_transform = datetime_transfrom

    def date_isbegin(self):
        """Add boolean start/end-of-year, -quarter and -month flag columns.

        Returns:
            The DataFrame passed to the constructor, with the new columns.
        """
        frame = self.df
        prefix = self.col
        stamps = frame[prefix].dt

        frame[prefix + '_year_start'] = stamps.is_year_start
        frame[prefix + '_year_end'] = stamps.is_year_end
        frame[prefix + '_quarter_start'] = stamps.is_quarter_start
        frame[prefix + '_quarter_end'] = stamps.is_quarter_end
        frame[prefix + '_month_start'] = stamps.is_month_start
        frame[prefix + '_month_end'] = stamps.is_month_end
        return frame

基本处理基础上二次处理

class TwoHandleDateFeature:
    """Build second-level features on top of basic calendar columns.

    The methods read columns named ``<col>_month``, ``<col>_month_day``,
    ``<col>_week``, ``<col>_day``, ``<col>_dayofweek``, ``<col>_hour`` and
    ``<col>_minute`` from the DataFrame, so those columns must already
    exist.  New columns are written in place and the same DataFrame is
    returned.

    Args:
        df: DataFrame holding the basic calendar columns; mutated in place.
        col: Prefix shared by the calendar columns.
    """

    # (column suffix, period used for the sine/cosine encoding).
    # NOTE: 30, 52 and 365 are fixed approximations of the real period.
    _CYCLES = (
        ('month', 12),
        ('month_day', 30),
        ('week', 52),
        ('day', 365),
        ('dayofweek', 7),
        ('hour', 24),
        ('minute', 60),
    )

    # (column suffix, hours flagged with 1).  The buckets overlap on purpose.
    _DAY_PARTS = (
        ('_work_hours', [9, 10, 11, 14, 15, 17]),
        ('_early_bird__hours', [8, 18]),
        ('_blackleg__hours', [7, 19, 20, 21]),
        ('_dawn_hours', [4, 5]),
        ('_early_morning_hours', [6, 7]),
        ('_later_morning_hours', [8, 9, 10]),
        ('_noon_hours', [11, 12, 13]),
        ('_afternoon_hours', [14, 15, 16]),
        ('_evening_hours', [17, 18, 19]),
        ('_night_hours', [20, 21, 22]),
        # Hour 0 was missing originally, so 00:xx fell into no bucket; 24 is
        # kept for callers whose hour column encodes midnight as 24.
        ('_midnight_hours', [23, 24, 0, 1, 2, 3]),
    )

    def __init__(self, df, col):
        self.df = df
        self.col = col

    def cycle_fea(self):
        """Add sine/cosine encodings of the calendar columns and a weekend flag.

        For each calendar column a ``<col>_sin_<name>`` and a
        ``<col>_cos_<name>`` column is added, plus ``<col>_weekend`` which is
        1 when ``<col>_dayofweek`` is 5 or 6 and 0 otherwise.

        Returns:
            The DataFrame passed to the constructor, with the new columns.
        """
        frame = self.df
        prefix = self.col
        for name, period in self._CYCLES:
            # Vectorised: one numpy call per column instead of a per-row lambda.
            angle = (np.pi * 2 / period) * frame[prefix + '_' + name]
            frame[prefix + '_sin_' + name] = np.sin(angle)
            # The original computed the minute "cos" column with np.sin.
            frame[prefix + '_cos_' + name] = np.cos(angle)
        frame[prefix + '_weekend'] = np.where(
            frame[prefix + '_dayofweek'].isin([5, 6]), 1, 0)
        return frame

    def part_day(self):
        """Add 0/1 indicator columns for parts of the day based on ``<col>_hour``.

        Returns:
            The DataFrame passed to the constructor, with the new columns.
        """
        frame = self.df
        hours = frame[self.col + '_hour']
        for suffix, members in self._DAY_PARTS:
            frame[self.col + suffix] = np.where(hours.isin(members), 1, 0)
        return frame

差分、滞后、滑窗、指数加权

class DateShiftRollingEwm:
    """Per-group difference, lag, rolling-window and exponentially weighted features.

    Every method groups the DataFrame by ``group``, derives new columns
    from ``col``, writes them into the DataFrame in place and returns it.

    Args:
        df: Source DataFrame; mutated in place.
        col: Name of the column the features are derived from.
        group: Grouping key(s) passed to ``DataFrame.groupby`` (a list of
            column names).
        windows: Rolling window size used by ``rollings``; also embedded in
            the column names produced by ``ewms``.
        alpha: Smoothing factor passed to ``ewm`` by ``ewms``.
        shift: Number of periods used by ``shifts``.
    """

    def __init__(self, df, col, group, windows=None, alpha=None, shift=None):
        self.df = df
        self.col = col
        self.shift = shift
        self.windows = windows
        self.alpha = alpha
        # group ---> list
        self.group = group

    def diffs(self):
        """Add first- and second-order differences of ``col`` within each group.

        Returns:
            The DataFrame with ``<col>_diff_1`` and ``<col>_diff_2`` added.
        """
        frame = self.df
        first = self.col + '_diff_1'
        frame[first] = frame.groupby(self.group)[self.col].diff(periods=1)
        # Second order = difference of the first-order differences.
        frame[self.col + '_diff_2'] = frame.groupby(self.group)[first].diff(periods=1)
        return frame

    def shifts(self):
        """Add ``col`` lagged by ``shift`` periods within each group.

        Missing values are forward-filled *within* each group.  The original
        used a global ``fillna(method='ffill')`` (a deprecated spelling),
        which copied the last value of one group into the leading gap of the
        next group.

        Returns:
            The DataFrame with ``<col>_shift_<shift>`` added.
        """
        frame = self.df
        lag_col = self.col + '_shift_' + str(self.shift)
        frame[lag_col] = frame.groupby(self.group)[self.col].shift(self.shift)
        frame[lag_col] = frame.groupby(self.group)[lag_col].ffill()
        return frame

    def rollings(self):
        """Add rolling mean/max/min of the previous rows within each group.

        Each series is shifted by one row before the window is applied, so
        the current row's own value is never part of its window.  The mean
        uses a triangular window (``win_type="triang"``).

        Notes on ``DataFrame.rolling(window, min_periods=None, center=False,
        win_type=None, on=None, axis=0, closed=None)``:

        - min_periods: minimum number of observations that must have a
          value; for an int window it defaults to the window size.
        - center: set the window label at the center; bool, default False.
        - win_type: window type (weighting function); str, default None.
        - on: optional column name of a DataFrame on which to compute the
          rolling window.
        - closed: which ends of the interval are closed; for offset windows
          the default is 'right' (left-open, right-closed); 'left', 'both'
          etc. may be given.
        - axis: direction, normally 0.

        Further statistics (std, skew, kurt, quantile) can be added with the
        same pattern if needed.

        Returns:
            The DataFrame with ``<col>_rolling_<windows>_mean``, ``_max``
            and ``_min`` added.
        """
        frame = self.df
        size = self.windows
        stem = self.col + '_rolling_' + str(size)
        # min_periods may not exceed the window, so clamp the former
        # hard-coded 3 for integer windows smaller than 3.
        needed = min(3, size) if isinstance(size, int) else 3
        series_by_group = frame.groupby(self.group)[self.col]

        frame[stem + '_mean'] = series_by_group.transform(
            lambda x: x.shift(1).rolling(
                window=size, min_periods=needed, win_type="triang").mean())
        frame[stem + '_max'] = series_by_group.transform(
            lambda x: x.shift(1).rolling(window=size, min_periods=needed).max())
        frame[stem + '_min'] = series_by_group.transform(
            lambda x: x.shift(1).rolling(window=size, min_periods=needed).min())
        # The sibling methods all return the frame; this one returned None.
        return frame

    def ewms(self):
        """Add exponentially weighted mean/std/corr of the previous rows per group.

        Each series is shifted by one row before weighting, so the current
        row's own value is excluded.

        Notes on ``DataFrame.ewm(com=None, span=None, halflife=None,
        alpha=None, min_periods=0, adjust=True, ignore_na=False, axis=0)``:

        - com: decay in terms of center of mass, alpha = 1 / (1 + com),
          com >= 0.
        - span: decay in terms of span, alpha = 2 / (span + 1), span >= 1.
        - halflife: decay in terms of half-life,
          alpha = 1 - exp(log(0.5) / halflife), halflife > 0.
        - alpha: smoothing factor given directly, 0 < alpha <= 1.
        - min_periods: minimum number of observations in the window
          required to have a value (otherwise NA); default 0.
        - adjust: divide by the decaying adjustment factor in the beginning
          periods to account for imbalance in relative weights; default True.
        - ignore_na: ignore missing values when computing weights; default
          False.
        - axis: 0 / 'index' for rows, 1 / 'columns' for columns.

        Exactly one of com, span, halflife and alpha may be given.
        Available functions: mean(), var(), std(), corr(), cov().

        Returns:
            The DataFrame with ``<col>_ewm_<windows>_mean``, ``_std`` and
            ``_corr`` added.
        """
        frame = self.df
        # NOTE(review): the column names embed ``windows`` although the
        # weighting is controlled by ``alpha``; kept unchanged so existing
        # consumers of these column names keep working.
        stem = self.col + '_ewm_' + str(self.windows)
        series_by_group = frame.groupby(self.group)[self.col]

        frame[stem + '_mean'] = series_by_group.transform(
            lambda x: x.shift(1).ewm(alpha=self.alpha).mean())
        frame[stem + '_std'] = series_by_group.transform(
            lambda x: x.shift(1).ewm(alpha=self.alpha).std())
        # NOTE(review): corr() is called without another series, so it
        # presumably correlates each series with itself - confirm whether
        # this feature is intended.
        frame[stem + '_corr'] = series_by_group.transform(
            lambda x: x.shift(1).ewm(alpha=self.alpha).corr())
        return frame