基于历史各项数据预计未来三个月销售额情况
import os.path
import pandas as pd
import numpy as np
import xgboost as xgb
base_path = "./data/"
数据读取
# 数据读取
navigation = pd.read_csv(os.path.join(base_path, 'navigation.csv'))
sales = pd.read_csv(os.path.join(base_path, 'sales.csv'))
train = pd.read_csv(os.path.join(base_path, 'train.csv'))
test = pd.read_csv(os.path.join(base_path, 'test.csv'))
vimages = pd.read_csv(os.path.join(base_path, 'vimages.csv'))
test.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | macro_material | color | fr_FR_price | en_US_description |
---|
0 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_1 | 1 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c | Accessories | Women | TEXTILES | SCARVES | WINTER SCARVES | ECHARPE REYKJAVIK | SANS LIGNE ESTHETIQUE | TEXTILE | NaN | 650.0 | In pure, soft cashmere, the Reykjavik scarf is... |
---|
1 | 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_1 | 1 | 26b2c4f6281482cccf1e748ef388f1649ecf1c8b | Leather Goods | Women | CITY BAGS | DAILY BAGS | HANDBAGS | VERY ZIPPED TOTE | VERY | LEATHER | KAKI FANGO COQUILLE | 2550.0 | For Fall-Winter 2018, the Very Zipped Tote han... |
---|
2 | 00aad36ebc04aaa761df219c1b48d1e6162d28bb_1 | 1 | 00aad36ebc04aaa761df219c1b48d1e6162d28bb | Leather Goods | Women | CITY BAGS | DAILY BAGS | CROSS SHOULDER BAGS | HINA | MAHINA | LEATHER | NOIR | 2460.0 | The Hina is two bags in one: gather in the sid... |
---|
3 | 8281636e7d162c92c1a3c1f9e40afb05d39a02f9_1 | 1 | 8281636e7d162c92c1a3c1f9e40afb05d39a02f9 | Accessories | Men | TEXTILES | STOLES | WINTER STOLES | ETOLE MONOGRAM NIGHTFALL | SANS LIGNE ESTHETIQUE | TEXTILE | NaN | 455.0 | With its subtle tone-on-tone Monogram pattern ... |
---|
4 | b76f8505700409a198fe69fb33e5d6cc01166c9b_1 | 1 | b76f8505700409a198fe69fb33e5d6cc01166c9b | Accessories | Men | TIES | TIES | OTHER TIES | CRAVATE MONOGRAM ILLUSION | SANS LIGNE ESTHETIQUE | TEXTILE | NaN | 150.0 | Suitable both for business and special occasio... |
---|
vimages.head()
| sku_hash | dim_1 | dim_2 | dim_3 | dim_4 | dim_5 | dim_6 | dim_7 | dim_8 | dim_9 | ... | dim_2039 | dim_2040 | dim_2041 | dim_2042 | dim_2043 | dim_2044 | dim_2045 | dim_2046 | dim_2047 | dim_2048 |
---|
0 | 3f6eba29c2c71cc804edb9a4a88760a273dd4348 | 0.000000 | 0.165251 | 0.017061 | 0.0 | 0.097332 | 0.000000 | 0.008793 | 0.064529 | 0.067048 | ... | 0.075003 | 0.000000 | 0.319221 | 0.756835 | 0.286287 | 0.000000 | 0.000000 | 0.000000 | 1.051608 | 0.677220 |
---|
1 | d7681062d1826c06aeca716e0aff6d5c096ac6d8 | 0.126899 | 0.000000 | 0.041690 | 0.0 | 0.600556 | 0.556608 | 0.608440 | 0.896061 | 0.011975 | ... | 0.000000 | 0.010133 | 0.325223 | 0.006643 | 0.169298 | 0.317276 | 0.308293 | 0.086837 | 1.122189 | 0.074612 |
---|
2 | c594b658ce0f17ea594a50426dcf69468b83bdef | 0.218891 | 0.444878 | 0.023655 | 0.0 | 0.933365 | 0.000000 | 0.413172 | 0.020560 | 0.010038 | ... | 0.075254 | 0.000000 | 0.037003 | 0.404053 | 0.301876 | 0.049935 | 0.031618 | 0.138646 | 0.697306 | 0.075182 |
---|
3 | 5b7dbbfcc0eba1037e6ab3c9e5cf3ea819deba65 | 0.002935 | 0.000000 | 0.504765 | 0.0 | 0.391393 | 0.159187 | 2.894631 | 0.708697 | 0.485973 | ... | 0.703062 | 0.000000 | 0.002509 | 0.000000 | 0.441945 | 0.850916 | 0.043980 | 0.000000 | 0.112908 | 0.618085 |
---|
4 | 3d3487afe62434c742af37cf935a726f5bb16251 | 0.000000 | 0.030289 | 0.084332 | 0.0 | 0.416195 | 0.000000 | 0.487941 | 0.154408 | 0.052589 | ... | 0.054207 | 0.000000 | 0.000000 | 0.000000 | 0.118480 | 0.000000 | 0.000000 | 0.073685 | 0.000000 | 0.114883 |
---|
5 rows × 2049 columns
train.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | macro_material | color | fr_FR_price | en_US_description | target |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | Leather Goods | Women | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | CANVAS | AZUR | 370.0 | Fashioned from summery Damier Azur canvas, thi... | 1366.0 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | Accessories | Women | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | CANVAS | NaN | 295.0 | This collectible piece features our iconic Pet... | 526.0 |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | LEATHER | GALET | 2660.0 | The perfect summer companion, this effortlessl... | 503.0 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | LEATHER | MAGNOLIA | 2660.0 | The perfect summer companion, this effortlessl... | 276.0 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | Accessories | Men | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | NON PRECIOUS METAL | NaN | 325.0 | This distinctive charm necklace reinterprets d... | 206.0 |
---|
navigation.head()
| Date | sku_hash | day_visit | month_visit | website_version_zone_number | website_version_country_number | traffic_source | page_views | addtocart |
---|
0 | Day 6 | b6cb56a971c9ffeeaf1833202ab453b51ab680a9 | Wednesday | 9 | 5.0 | 32.0 | Source 1 | 105 | 0 |
---|
1 | Day 4 | 9c5060f9fb648cdfafd66f170af517f4de0a9162 | Monday | 9 | 5.0 | 32.0 | Source 1 | 314 | 0 |
---|
2 | Day 4 | da1549af83c813e5b53a4e4c0d588fa0099e3d3d | Monday | 10 | 5.0 | 32.0 | Source 1 | 57 | 0 |
---|
3 | Day 6 | 477ac04baf2a484f441ffe688abc89bb46a1c21d | Wednesday | 10 | 5.0 | 32.0 | Source 1 | 547 | 0 |
---|
4 | Day 4 | 799bd8691fe0be0137173eadd5e79d033a389bd8 | Monday | 10 | 5.0 | 32.0 | Source 2 | 43 | 0 |
---|
sales.head()
| Date | day_transaction_date | Month_transaction | type | zone_number | country_number | name | sku_hash | sales_quantity | currency_rate_USD | ... | NetSentiment_6_day_before | PositiveSentiment_6_day_before | NegativeSentiment_6_day_before | Impressions_6_day_before | TotalBuzzPost_7_day_before | TotalBuzz_7_day_before | NetSentiment_7_day_before | PositiveSentiment_7_day_before | NegativeSentiment_7_day_before | Impressions_7_day_before |
---|
0 | Day_7 | Thursday | 4 | Type_2 | 4 | 43 | Name_1 | 84d4e90b67c2b9902cff522be4eb121f637c17b5 | 5 | 1.225636 | ... | 77.525599 | 33634.0 | 4258.0 | 643226023.0 | 68960.0 | 107970.0 | 70.473656 | 32032.0 | 5548.0 | 631477319.0 |
---|
1 | Day_6 | Wednesday | 4 | Type_1 | 4 | 20 | Name_1 | 84d4e90b67c2b9902cff522be4eb121f637c17b5 | 5 | 1.228951 | ... | 70.473656 | 32032.0 | 5548.0 | 631477319.0 | 65312.0 | 102920.0 | 71.788703 | 34277.0 | 5629.0 | 730132649.0 |
---|
2 | Day_1 | Monday | 1 | Type_2 | 2 | 65 | Name_1 | 70f3dd559074ecfc9d69ff64480bc8acae977e3a | 5 | 1.201496 | ... | 46.158532 | 27594.0 | 10165.0 | 359205211.0 | 45746.0 | 64881.0 | 73.083627 | 21349.0 | 3320.0 | 268925766.0 |
---|
3 | Day_7 | Sunday | 1 | Type_1 | 3 | 49 | Name_1 | 70f3dd559074ecfc9d69ff64480bc8acae977e3a | 5 | 1.204091 | ... | 72.438611 | 16292.0 | 2604.0 | 307167067.0 | 42796.0 | 61653.0 | 68.744442 | 20872.0 | 3866.0 | 323820589.0 |
---|
4 | Day_5 | Tuesday | 2 | Type_1 | 4 | 20 | Name_1 | 1f8b7a2b42d4cdf5eef70f75fb86a34d514d2747 | 5 | 1.238391 | ... | 72.776593 | 28344.0 | 4466.0 | 440849942.0 | 60653.0 | 90848.0 | 75.338753 | 26527.0 | 3731.0 | 393463263.0 |
---|
5 rows × 97 columns
获取不同颜色产品的平均指标
product_descriptor = ['product_type', 'product_gender', 'macro_function',
'function', 'sub_function', 'model', 'aesthetic_sub_line', 'macro_material',
'month']
product_target_sum = train.groupby(product_descriptor)['target'].sum().reset_index(name = 'sum_target')
product_target_count = train.groupby(product_descriptor)['target'].count().reset_index(name = 'count_target')
product_target_stats = pd.merge(product_target_sum,product_target_count,on=product_descriptor)
train = train.merge(product_target_stats,on=product_descriptor,how='left')
test = test.merge(product_target_stats,on=product_descriptor,how='left')
train['mean_target'] = (train['sum_target'] - train['target'])/(train['count_target']-1)
test['mean_target'] = test['sum_target']/test['count_target']
train.drop(['count_target','sum_target'],axis=1) #这块需要指定inplace 没指定相当于打印不改变DF
test.drop(['count_target','sum_target'],axis=1).head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | macro_material | color | fr_FR_price | en_US_description | mean_target |
---|
0 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_1 | 1 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c | Accessories | Women | TEXTILES | SCARVES | WINTER SCARVES | ECHARPE REYKJAVIK | SANS LIGNE ESTHETIQUE | TEXTILE | NaN | 650.0 | In pure, soft cashmere, the Reykjavik scarf is... | 782.0 |
---|
1 | 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_1 | 1 | 26b2c4f6281482cccf1e748ef388f1649ecf1c8b | Leather Goods | Women | CITY BAGS | DAILY BAGS | HANDBAGS | VERY ZIPPED TOTE | VERY | LEATHER | KAKI FANGO COQUILLE | 2550.0 | For Fall-Winter 2018, the Very Zipped Tote han... | 179.0 |
---|
2 | 00aad36ebc04aaa761df219c1b48d1e6162d28bb_1 | 1 | 00aad36ebc04aaa761df219c1b48d1e6162d28bb | Leather Goods | Women | CITY BAGS | DAILY BAGS | CROSS SHOULDER BAGS | HINA | MAHINA | LEATHER | NOIR | 2460.0 | The Hina is two bags in one: gather in the sid... | NaN |
---|
3 | 8281636e7d162c92c1a3c1f9e40afb05d39a02f9_1 | 1 | 8281636e7d162c92c1a3c1f9e40afb05d39a02f9 | Accessories | Men | TEXTILES | STOLES | WINTER STOLES | ETOLE MONOGRAM NIGHTFALL | SANS LIGNE ESTHETIQUE | TEXTILE | NaN | 455.0 | With its subtle tone-on-tone Monogram pattern ... | NaN |
---|
4 | b76f8505700409a198fe69fb33e5d6cc01166c9b_1 | 1 | b76f8505700409a198fe69fb33e5d6cc01166c9b | Accessories | Men | TIES | TIES | OTHER TIES | CRAVATE MONOGRAM ILLUSION | SANS LIGNE ESTHETIQUE | TEXTILE | NaN | 150.0 | Suitable both for business and special occasio... | NaN |
---|
train.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | macro_material | color | fr_FR_price | en_US_description | target | sum_target | count_target | mean_target |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | Leather Goods | Women | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | CANVAS | AZUR | 370.0 | Fashioned from summery Damier Azur canvas, thi... | 1366.0 | 1837.0 | 2 | 471.000000 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | Accessories | Women | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | CANVAS | NaN | 295.0 | This collectible piece features our iconic Pet... | 526.0 | 526.0 | 1 | NaN |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | LEATHER | GALET | 2660.0 | The perfect summer companion, this effortlessl... | 503.0 | 1395.0 | 4 | 297.333333 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | LEATHER | MAGNOLIA | 2660.0 | The perfect summer companion, this effortlessl... | 276.0 | 1395.0 | 4 | 373.000000 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | Accessories | Men | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | NON PRECIOUS METAL | NaN | 325.0 | This distinctive charm necklace reinterprets d... | 206.0 | 206.0 | 1 | NaN |
---|
train.drop(['count_target','sum_target'],axis=1,inplace =True)
test.drop(['count_target','sum_target'],axis=1,inplace =True)
train.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | macro_material | color | fr_FR_price | en_US_description | target | mean_target |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | Leather Goods | Women | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | CANVAS | AZUR | 370.0 | Fashioned from summery Damier Azur canvas, thi... | 1366.0 | 471.000000 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | Accessories | Women | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | CANVAS | NaN | 295.0 | This collectible piece features our iconic Pet... | 526.0 | NaN |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | LEATHER | GALET | 2660.0 | The perfect summer companion, this effortlessl... | 503.0 | 297.333333 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | LEATHER | MAGNOLIA | 2660.0 | The perfect summer companion, this effortlessl... | 276.0 | 373.000000 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | Accessories | Men | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | NON PRECIOUS METAL | NaN | 325.0 | This distinctive charm necklace reinterprets d... | 206.0 | NaN |
---|
统计同款的数量
count_vec_cols = ['macro_function', 'function', 'sub_function', 'model',
'aesthetic_sub_line', 'macro_material', 'color']
for col in count_vec_cols:
tmp = pd.DataFrame({'sku_hash':pd.concat([train['sku_hash'],test['sku_hash']]),col:pd.concat([train[col],test[col]])})
tmp = pd.DataFrame(tmp.groupby(col)['sku_hash'].count()).reset_index()
tmp.columns = [col,col+'_count']
train = train.merge(tmp,on=col,how='left')
test = test.merge(tmp,on=col,how='left')
train.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | ... | en_US_description | target | mean_target | macro_function_count | function_count | sub_function_count | model_count | aesthetic_sub_line_count | macro_material_count | color_count |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | Leather Goods | Women | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | ... | Fashioned from summery Damier Azur canvas, thi... | 1366.0 | 471.000000 | 2292 | 2094 | 447 | 90 | 126 | 3213 | 120.0 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | Accessories | Women | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | ... | This collectible piece features our iconic Pet... | 526.0 | NaN | 546 | 126 | 102 | 3 | 3477 | 3213 | NaN |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | The perfect summer companion, this effortlessl... | 503.0 | 297.333333 | 3039 | 2784 | 699 | 15 | 90 | 4089 | 39.0 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | The perfect summer companion, this effortlessl... | 276.0 | 373.000000 | 3039 | 2784 | 699 | 15 | 90 | 4089 | 60.0 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | Accessories | Men | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | ... | This distinctive charm necklace reinterprets d... | 206.0 | NaN | 843 | 171 | 162 | 3 | 3477 | 1023 | NaN |
---|
5 rows × 23 columns
统计不同访问源的数量
traffic_source_views = navigation.groupby(['sku_hash','traffic_source'])['page_views'].sum().reset_index()
traffic_source_views[:5]
| sku_hash | traffic_source | page_views |
---|
0 | 000cb631113e2f54ca5512139a6592e9584957aa | Source 1 | 1748 |
---|
1 | 000cb631113e2f54ca5512139a6592e9584957aa | Source 2 | 40 |
---|
2 | 000cb631113e2f54ca5512139a6592e9584957aa | Source 5 | 30 |
---|
3 | 000cb631113e2f54ca5512139a6592e9584957aa | Source 6 | 48 |
---|
4 | 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 | Source 1 | 16300 |
---|
traffic_source_views = traffic_source_views.pivot(index = 'sku_hash',columns = 'traffic_source',values = 'page_views').reset_index()
traffic_source_views.columns = ['sku_hash',
'page_views_nav1', 'page_views_nav2', 'page_views_nav3',
'page_views_nav4', 'page_views_nav5', 'page_views_nav6']
traffic_source_views.head()
| sku_hash | page_views_nav1 | page_views_nav2 | page_views_nav3 | page_views_nav4 | page_views_nav5 | page_views_nav6 |
---|
0 | 000cb631113e2f54ca5512139a6592e9584957aa | 1748.0 | 40.0 | NaN | NaN | 30.0 | 48.0 |
---|
1 | 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 | 16300.0 | 286.0 | NaN | 25.0 | 93.0 | 25.0 |
---|
2 | 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 | 147.0 | 5.0 | NaN | NaN | 5.0 | NaN |
---|
3 | 00287bbb94c12066df6491dccd744ee87ff01a90 | 3473.0 | 15.0 | NaN | 25.0 | NaN | NaN |
---|
4 | 003f8a76cb823eb7c58b6d052c57a8933f9275fd | 47468.0 | 251.0 | NaN | 55.0 | NaN | 10.0 |
---|
统计不同类型的销售数量
type_sales = sales.groupby(['sku_hash','type'])['sales_quantity'].sum().reset_index()
type_sales = type_sales.pivot(index = 'sku_hash',columns = 'type',values = 'sales_quantity').reset_index()
type_sales.columns = ['sku_hash', 'sales_quantity_type1', 'sales_quantity_type2']
type_sales.head()
| sku_hash | sales_quantity_type1 | sales_quantity_type2 |
---|
0 | 000cb631113e2f54ca5512139a6592e9584957aa | 104.0 | 10.0 |
---|
1 | 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 | 325.0 | 40.0 |
---|
2 | 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 | 85.0 | 25.0 |
---|
3 | 00287bbb94c12066df6491dccd744ee87ff01a90 | 35.0 | 10.0 |
---|
4 | 003f8a76cb823eb7c58b6d052c57a8933f9275fd | 253.0 | 145.0 |
---|
统计不同地区情况
zone_sales = sales.groupby(['sku_hash','zone_number'])['sales_quantity'].sum().reset_index()
zone_sales = zone_sales.pivot(index = 'sku_hash',columns = 'zone_number',values = 'sales_quantity').reset_index()
zone_sales.columns = ['sku_hash',
'sales_quantity_zone1', 'sales_quantity_zone2', 'sales_quantity_zone3',
'sales_quantity_zone4', 'sales_quantity_zone5']
zone_sales.head()
| sku_hash | sales_quantity_zone1 | sales_quantity_zone2 | sales_quantity_zone3 | sales_quantity_zone4 | sales_quantity_zone5 |
---|
0 | 000cb631113e2f54ca5512139a6592e9584957aa | 20.0 | 5.0 | 15.0 | 50.0 | 24.0 |
---|
1 | 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 | NaN | NaN | 82.0 | 20.0 | 263.0 |
---|
2 | 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 | 55.0 | 35.0 | 10.0 | NaN | 10.0 |
---|
3 | 00287bbb94c12066df6491dccd744ee87ff01a90 | 10.0 | NaN | 5.0 | 30.0 | NaN |
---|
4 | 003f8a76cb823eb7c58b6d052c57a8933f9275fd | 45.0 | 148.0 | 50.0 | 140.0 | 15.0 |
---|
统计各种网络情况
navigation_stats = navigation.groupby(['sku_hash'])['page_views'].sum().reset_index(name='page_views')
sales_stats = sales.groupby(['sku_hash'])['sales_quantity','TotalBuzzPost', 'TotalBuzz','NetSentiment', 'PositiveSentiment', 'NegativeSentiment', 'Impressions'].sum().reset_index()
navigation_stats.head()
| sku_hash | page_views |
---|
0 | 000cb631113e2f54ca5512139a6592e9584957aa | 1866 |
---|
1 | 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 | 16729 |
---|
2 | 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 | 157 |
---|
3 | 00287bbb94c12066df6491dccd744ee87ff01a90 | 3513 |
---|
4 | 003f8a76cb823eb7c58b6d052c57a8933f9275fd | 47784 |
---|
sales_stats.head()
| sku_hash | sales_quantity | TotalBuzzPost | TotalBuzz | NetSentiment | PositiveSentiment | NegativeSentiment | Impressions |
---|
0 | 000cb631113e2f54ca5512139a6592e9584957aa | 114 | 1308458.0 | 1791611.0 | 1471.884478 | 469288.0 | 67668.0 | 8.729030e+09 |
---|
1 | 0020d561eab8b88ab55dfde84a2f12b865e5e0b4 | 365 | 497215.0 | 803633.0 | 851.430118 | 228523.0 | 38871.0 | 4.159187e+09 |
---|
2 | 0026e7a0fcfe5999a44b70b1acaff00fc1ad3ac2 | 110 | 1206926.0 | 1718843.0 | 1380.767295 | 524646.0 | 100366.0 | 1.121427e+10 |
---|
3 | 00287bbb94c12066df6491dccd744ee87ff01a90 | 45 | 475098.0 | 680543.0 | 581.175056 | 211602.0 | 33527.0 | 3.253251e+09 |
---|
4 | 003f8a76cb823eb7c58b6d052c57a8933f9275fd | 398 | 3282466.0 | 4795323.0 | 3742.253384 | 1401916.0 | 307473.0 | 2.389471e+10 |
---|
划分数据集
train['idx'] = pd.Categorical(train.sku_hash).codes
train['idx'] = train['idx'] % 5
train.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | ... | target | mean_target | macro_function_count | function_count | sub_function_count | model_count | aesthetic_sub_line_count | macro_material_count | color_count | idx |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | Leather Goods | Women | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | ... | 1366.0 | 471.000000 | 2292 | 2094 | 447 | 90 | 126 | 3213 | 120.0 | 1 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | Accessories | Women | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | ... | 526.0 | NaN | 546 | 126 | 102 | 3 | 3477 | 3213 | NaN | 1 |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | 503.0 | 297.333333 | 3039 | 2784 | 699 | 15 | 90 | 4089 | 39.0 | 4 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | 276.0 | 373.000000 | 3039 | 2784 | 699 | 15 | 90 | 4089 | 60.0 | 2 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | Accessories | Men | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | ... | 206.0 | NaN | 843 | 171 | 162 | 3 | 3477 | 1023 | NaN | 0 |
---|
5 rows × 24 columns
整合数据集
X = train.copy()
X = X.merge(navigation_stats, on = 'sku_hash', how = 'left')
X = X.merge(sales_stats, on = 'sku_hash', how = 'left')
X = X.merge(traffic_source_views, on = 'sku_hash', how = 'left')
X = X.merge(type_sales, on = 'sku_hash', how = 'left')
X = X.merge(zone_sales, on = 'sku_hash', how = 'left')
X.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | ... | page_views_nav4 | page_views_nav5 | page_views_nav6 | sales_quantity_type1 | sales_quantity_type2 | sales_quantity_zone1 | sales_quantity_zone2 | sales_quantity_zone3 | sales_quantity_zone4 | sales_quantity_zone5 |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | Leather Goods | Women | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | ... | 20.0 | 25.0 | 29.0 | 946.0 | 554.0 | 506.0 | 238.0 | 75.0 | 544.0 | 137.0 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | Accessories | Women | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | ... | NaN | 39.0 | NaN | 338.0 | 60.0 | 84.0 | 55.0 | 25.0 | 155.0 | 79.0 |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | 10.0 | 10.0 | 5.0 | 429.0 | 145.0 | 190.0 | 55.0 | 35.0 | 195.0 | 99.0 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | Leather Goods | Women | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | 5.0 | 10.0 | 5.0 | 228.0 | 55.0 | 108.0 | 55.0 | 20.0 | 20.0 | 80.0 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | Accessories | Men | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | ... | NaN | 5.0 | NaN | 110.0 | 25.0 | 35.0 | 5.0 | 20.0 | 30.0 | 45.0 |
---|
5 rows × 45 columns
X = train.copy()
X = X.merge(navigation_stats, on = 'sku_hash', how = 'left')
X = X.merge(sales_stats, on = 'sku_hash', how = 'left')
X = X.merge(traffic_source_views, on = 'sku_hash', how = 'left')
X = X.merge(type_sales, on = 'sku_hash', how = 'left')
X = X.merge(zone_sales, on = 'sku_hash', how = 'left')
X.loc[X.product_type=='Accessories','product_type'] = '0'
X.loc[X.product_type=='Leather Goods','product_type'] = '1'
X.product_type = X.product_type.astype(int)
X.loc[X.product_gender=='Women','product_gender'] = '-1'
X.loc[X.product_gender=='Unisex','product_gender'] = '0'
X.loc[X.product_gender=='Men','product_gender'] = '1'
X.product_gender = X.product_gender.astype(int)
# 变换标签
X['y'] = np.log(X['target']+1)
X.head()
| ID | month | sku_hash | product_type | product_gender | macro_function | function | sub_function | model | aesthetic_sub_line | ... | page_views_nav5 | page_views_nav6 | sales_quantity_type1 | sales_quantity_type2 | sales_quantity_zone1 | sales_quantity_zone2 | sales_quantity_zone3 | sales_quantity_zone4 | sales_quantity_zone5 | y |
---|
0 | 3d8a4ae769b526187c36901f204691a663333fa4_1 | 1 | 3d8a4ae769b526187c36901f204691a663333fa4 | 1 | -1 | LG ACCESSORIES | SMALL LEATHER GOODS | COMPACT WALLETS | PF.VICTORINE | DAMIER AZUR | ... | 25.0 | 29.0 | 946.0 | 554.0 | 506.0 | 238.0 | 75.0 | 544.0 | 137.0 | 7.220374 |
---|
1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6_1 | 1 | c05a54f7067be054ec4b27d0d6081353ef7d9df6 | 0 | -1 | FANCY ACCESSORIES | BAG CHARMS | BAG CHARMS | BAG CHARM PETITE MALLE | SANS LIGNE ESTHETIQUE | ... | 39.0 | NaN | 338.0 | 60.0 | 84.0 | 55.0 | 25.0 | 155.0 | 79.0 | 6.267201 |
---|
2 | 7cb4d3626bd48a9b523d8693266219c34aeccde8_1 | 1 | 7cb4d3626bd48a9b523d8693266219c34aeccde8 | 1 | -1 | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | 10.0 | 5.0 | 429.0 | 145.0 | 190.0 | 55.0 | 35.0 | 195.0 | 99.0 | 6.222576 |
---|
3 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3_1 | 1 | 8e5967f90ff9fb93aa2840fceecc537b2e8e6ff3 | 1 | -1 | CITY BAGS | DAILY BAGS | SHOULDER BAGS | GIROLATA | MAHINA | ... | 10.0 | 5.0 | 228.0 | 55.0 | 108.0 | 55.0 | 20.0 | 20.0 | 80.0 | 5.624018 |
---|
4 | 6913a128945e0efeafc52101dcdeaa610eaa4430_1 | 1 | 6913a128945e0efeafc52101dcdeaa610eaa4430 | 0 | 1 | FASHION JEWELRY | NECKLACES | NECKLACES | COLL.CHARMS FOR GENTLEMEN | SANS LIGNE ESTHETIQUE | ... | 5.0 | NaN | 110.0 | 25.0 | 35.0 | 5.0 | 20.0 | 30.0 | 45.0 | 5.332719 |
---|
5 rows × 46 columns
整合测试集
Z = test.copy()
Z = Z.merge(navigation_stats, on = 'sku_hash', how = 'left')
Z = Z.merge(sales_stats, on = 'sku_hash', how = 'left')
Z = Z.merge(traffic_source_views, on = 'sku_hash', how = 'left')
Z = Z.merge(type_sales, on = 'sku_hash', how = 'left')
Z = Z.merge(zone_sales, on = 'sku_hash', how = 'left')
Z.loc[Z.product_type=='Accessories','product_type'] = '0'
Z.loc[Z.product_type=='Leather Goods','product_type'] = '1'
Z.product_type = Z.product_type.astype(int)
Z.loc[Z.product_gender=='Women','product_gender'] = '-1'
Z.loc[Z.product_gender=='Unisex','product_gender'] = '0'
Z.loc[Z.product_gender=='Men','product_gender'] = '1'
Z.product_gender = Z.product_gender.astype(int)
features = ['product_type', 'product_gender',
'page_views', 'sales_quantity',
'TotalBuzzPost', 'TotalBuzz', 'NetSentiment', 'PositiveSentiment', 'NegativeSentiment', 'Impressions',
'fr_FR_price',
'macro_function_count', 'function_count', 'sub_function_count', 'model_count', 'aesthetic_sub_line_count', 'macro_material_count', 'color_count',
'page_views_nav1', 'page_views_nav2', 'page_views_nav3', 'page_views_nav4', 'page_views_nav5', 'page_views_nav6',
'sales_quantity_type1', 'sales_quantity_type2',
'sales_quantity_zone1','sales_quantity_zone2','sales_quantity_zone3', 'sales_quantity_zone4','sales_quantity_zone5',
'mean_target',]
交叉验证
# 选取某一个月的情况
def train_test_split(tr, te, mo, feats, num_folds):
Xtrain = []
ytrain = []
dtrain = []
Xval = []
yval = []
dval =[]
for i in range(num_folds):
Xtrain.append(tr.loc[(tr.month==mo)&(tr.idx != i),feats].values)
ytrain.append(tr.loc[(tr.month==mo)&(tr.idx != i),'y'].values)
dtrain.append(xgb.DMatrix(Xtrain[i],ytrain[i]))
Xval.append(tr.loc[(tr.month==mo)&(tr.idx == i),feats].values)
yval.append(tr.loc[(tr.month==mo)&(tr.idx == i),'y'].values)
dval.append(xgb.DMatrix(Xval[i],yval[i]))
Xtest = te.loc[(te.month ==mo),feats].values
dtest = xgb.DMatrix(Xtest)
return dtrain, dval, dtest
Xgboost参数
param = {}
param['objective'] = 'reg:linear'
param['eval_metric'] = 'rmse'
param['booster'] = 'gbtree'
param['eta'] = 0.025
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['num_parallel_tree'] = 3
param['min_child_weight'] = 25
param['gamma'] = 5
param['max_depth'] = 3
param['silent'] = 1
第一个月的情况
# train models for the 1 month
dtrain, dval, dtest = train_test_split(tr = X, te = Z, mo = 1, feats = features, num_folds = 5)
model_m1 = []
for i in range(5):
model_m1.append(
xgb.train(
param,
dtrain[i],
50000,
[(dtrain[i],'train'), (dval[i],'eval')],
early_stopping_rounds = 200,
verbose_eval = False)
)
# run predictions for the 1 month
oof_m1 = []
oof_test_m1 = []
for i in range(5):
oof_m1.append(model_m1[i].predict(dval[i]))
oof_test_m1.append(model_m1[i].predict(dtest))
test_m1 = np.mean(oof_test_m1, axis=0)
m1 = {}
for i in range(5):
m1 = {**m1, **dict(zip(X.loc[(X.month==1) & (X.idx==i),'sku_hash'], oof_m1[i]))}
m1 = {**m1, **dict(zip(Z.loc[(Z.month==1),'sku_hash'], test_m1))}
oof_m1 = pd.DataFrame.from_dict(m1, orient='index').reset_index()
oof_m1.columns = ['sku_hash', 'oof_m1']
X2 = pd.merge(X.copy(), oof_m1, on = 'sku_hash')
Z2 = pd.merge(Z.copy(), oof_m1, on = 'sku_hash')
features2 = features + ['oof_m1']
第二个月情况
dtrain2, dval2, dtest2 = train_test_split(tr = X2, te = Z2, mo = 2, feats = features2, num_folds = 5)
model_m2 = []
for i in range(5):
model_m2.append(
xgb.train(
param,
dtrain2[i],
50000,
[(dtrain2[i],'train'), (dval2[i],'eval')],
early_stopping_rounds = 200,
verbose_eval = False)
)
# run predictions for the 2 month
oof_m2 = []
oof_test_m2 = []
for i in range(5):
oof_m2.append(model_m2[i].predict(dval2[i]))
oof_test_m2.append(model_m2[i].predict(dtest2))
test_m2 = np.mean(oof_test_m2, axis=0)
m2 = {}
for i in range(5):
m2 = {**m2, **dict(zip(X.loc[(X.month==2) & (X.idx==i),'sku_hash'], oof_m2[i]))}
m2 = {**m2, **dict(zip(Z.loc[(Z.month==2),'sku_hash'], test_m2))}
oof_m2 = pd.DataFrame.from_dict(m2, orient='index').reset_index()
oof_m2.columns = ['sku_hash', 'oof_m2']
X3 = pd.merge(X2.copy(), oof_m2, on = 'sku_hash')
Z3 = pd.merge(Z2.copy(), oof_m2, on = 'sku_hash')
features3 = features2 + ['oof_m2']
第三个月情况
dtrain3, dval3, dtest3 = train_test_split(tr = X3, te = Z3, mo = 3, feats = features3, num_folds = 5)
model_m3 = []
for i in range(5):
model_m3.append(
xgb.train(
param,
dtrain3[i],
50000,
[(dtrain3[i],'train'),(dval3[i],'eval')],
early_stopping_rounds = 200,
verbose_eval = False)
)
# run predictions for the 3 month
oof_m3 = []
oof_test_m3 = []
for i in range(5):
oof_m3.append(model_m3[i].predict(dval3[i]))
oof_test_m3.append(model_m3[i].predict(dtest3))
test_m3 = np.mean(oof_test_m3, axis=0)
m3 = {}
for i in range(5):
m3 = {**m3, **dict(zip(X.loc[(X.month==3) & (X.idx==i),'sku_hash'], oof_m3[i]))}
m3 = {**m3, **dict(zip(Z.loc[(Z.month==3),'sku_hash'], test_m3))}
oof_m3 = pd.DataFrame.from_dict(m3, orient='index').reset_index()
oof_m3.columns = ['sku_hash', 'oof_m3']
X3 = pd.merge(X3.copy(), oof_m3, on = 'sku_hash')
Z3 = pd.merge(Z3.copy(), oof_m3, on = 'sku_hash')
# 方便评估,设定一个
Z3['target'] = 0
Z3.loc[Z3.month == 1, 'target'] = Z3.loc[Z3.month == 1, 'oof_m1']
Z3.loc[Z3.month == 2, 'target'] = Z3.loc[Z3.month == 2, 'oof_m2']
Z3.loc[Z3.month == 3, 'target'] = Z3.loc[Z3.month == 3, 'oof_m3']
X3['pred_target'] = 0
X3.loc[X3.month == 1, 'pred_target'] = X3.loc[X3.month == 1, 'oof_m1']
X3.loc[X3.month == 2, 'pred_target'] = X3.loc[X3.month == 2, 'oof_m2']
X3.loc[X3.month == 3, 'pred_target'] = X3.loc[X3.month == 3, 'oof_m3']
评估结果
print(f"month1: {np.sqrt(np.mean((X3.loc[X3.month==1,'y'] - X3.loc[X3.month==1,'pred_target'])**2))}")
print(f"month2: {np.sqrt(np.mean((X3.loc[X3.month==2,'y'] - X3.loc[X3.month==2,'pred_target'])**2))}")
print(f"month3: {np.sqrt(np.mean((X3.loc[X3.month==3,'y'] - X3.loc[X3.month==3,'pred_target'])**2))}")
print(f"overall: {np.sqrt(np.mean((X3['y'] - X3['pred_target'])**2))}")
month1: 0.4401109610656908
month2: 0.5816163893598695
month3: 0.7147597943809006
overall: 0.5895921884618234
生成结果
Z3['target'] = np.exp(Z3.target)-1
final_sub = Z3[['ID','target']]
final_sub.to_csv(os.path.join(base_path,'silly-raddar-sub4.csv'),index=None)
final = pd.read_csv('./data/silly-raddar-sub4.csv')
final.head()
| ID | target |
---|
0 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_1 | 300.303182 |
---|
1 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_2 | 301.345169 |
---|
2 | ed4c7471eac7e8c6e6718364c2b6e75462eeb47c_3 | 212.500844 |
---|
3 | 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_1 | 85.048020 |
---|
4 | 26b2c4f6281482cccf1e748ef388f1649ecf1c8b_2 | 71.009720 |
---|
本文标签:
销售额商品
发表评论