-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvisualize-data.py
72 lines (50 loc) · 1.52 KB
/
visualize-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 12 15:41:26 2018
@author: Liaowei
"""
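'''
Data inspection
1. Check the number of missing values for each feature; features with too many missing values are dropped outright.
2. Check the mode of each feature and use the mode to fill in missing values.
3. Separate the categorical features from the continuous ones.
4. Check the correlation coefficients between features and remove strongly correlated features.
5. Check the distribution of the continuous values and process them.
6.
'''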
import pandas as pd
import numpy as np
import time
import datetime
import matplotlib.pyplot as plt
# Load the training set (the CSV file is GB2312-encoded).
train_df = pd.read_csv(
    '../data/f_train_20180204.csv',
    encoding='gb2312',
)

# Non-null observation count per column; columns with fewer than 500
# non-null values are printed.
feature = train_df.count()
too_few = feature < 500
drop_feature = feature[too_few]
print('count:\n', drop_feature)

# Per-column median.
# NOTE(review): the plan in the file header talks about filling missing values
# with the mode, while this prints the median -- confirm which is intended.
column_medians = train_df.median()
print('median:\n', column_medians)
# Names of the columns treated as categorical: the 55 SNP columns
# (SNP1 ... SNP55) followed by two further columns.
snp_columns = ['SNP%d' % idx for idx in range(1, 56)]
category_feature = snp_columns + ['DM家族史', 'ACEID']
# Pairwise correlation matrix of the training frame's columns
# (DataFrame.corr uses Pearson correlation by default).
corr = train_df.corr()

# Heat-map of the correlation matrix.
# matshow() opens its own, suitably sized figure when no fignum is passed, so
# calling plt.figure() beforehand would only leave an extra empty window.
plt.matshow(corr)
plt.colorbar()
plt.show()
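'''
Pre-pregnancy weight (孕前体重) is strongly correlated with pre-pregnancy BMI (孕前BMI),
BMI category (BMI分类) and the at-delivery value (分娩时).
'''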
# Collect the features that are redundant because they are strongly
# correlated with a later column of the correlation matrix `corr`.
#
# Only the strict upper triangle is inspected (k=1), so each pair of columns
# is considered once and the diagonal (self-correlation) is ignored; for every
# strongly correlated pair it is the earlier column that gets marked.
# The magnitude |r| is thresholded so that strong negative correlations are
# caught as well as strong positive ones.
strong_pairs = np.triu((corr.abs() > 0.8).values, k=1)

# Row i is flagged when column i correlates strongly with any later column.
# Reading the labels straight off the index yields each name exactly once and
# in column order, so the result is identical on every run (a set round-trip
# would return the names in arbitrary order).
del_feature = corr.index[strong_pairs.any(axis=1)].tolist()
'''
VAR00007 is approximately normally distributed.
'''
# Histograms of VAR00007: raw values next to their log(1 + x) transform.
var007 = train_df['VAR00007']
VAR007_temp = pd.DataFrame({'VAR': var007, 'log(VAR+1)': np.log1p(var007)})
VAR007_temp.hist()
#train_df.groupby('label').mean().plot(y='VAR00007',marker='o')

# Separate figure with the raw values only.
VAR007_temp = pd.DataFrame({'VAR': var007})
VAR007_temp.hist()

# Without an explicit show(), these histogram figures are never displayed when
# the file is run as a plain script with a non-interactive session.
plt.show()