1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 29 20:30:12 2018
4
+
5
+ @author: situ
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import os
11
+ import re
12
+
13
# Switch to the project data directory (Windows path kept for reference).
#os.chdir("E:/graduate/class/EDA/final")
os.chdir("/Users/situ/Documents/EDA/final")
# Raw scraped internship postings; file is GBK-encoded (Chinese text).
data = pd.read_csv("data_with_skill.csv", encoding="gbk")
data.head()
data.info()
18
+
19
# Drop columns not used by the analysis below.
data.drop(["jobname", "jobgood", "url", "city"], axis=1, inplace=True)

# --- numeric features ---------------------------------------------------
# Working days per week: map "i天/周" -> i for i in 2..6.
data.jobway.unique()
mapping = {str(n) + '天/周': n for n in range(2, 7)}
print(mapping)
data['day_per_week'] = data['jobway'].map(mapping)
data['day_per_week'].head()
29
+
30
+
31
# Company size: collapse the six raw size labels into small/medium/large buckets.
# BUG FIX: the original used chained indexing (data["comp_size"][mask] = ...),
# which raises SettingWithCopyWarning and may silently fail to write back;
# a single .map() over a lookup dict is both safe and clearer.
data["size"].unique()
_size_buckets = {
    '少于15人': '小型企业',
    '15-50人': '小型企业',
    '50-150人': '中型企业',
    '150-500人': '中型企业',
    '500-2000人': '大型企业',
    '2000人以上': '大型企业',
}
# unmapped sizes keep the original default of an empty string
data["comp_size"] = data["size"].map(_size_buckets).fillna("")
40
+
41
# Internship duration: map "实习i个月" -> i (months).
data.month.unique()
mapping = {}
for i in range(1, 22):
    mapping["实习" + str(i) + '个月'] = i
print(mapping)
data['time_span'] = data['month'].map(mapping)
# BUG FIX: the original computed data['time_span'].apply(lambda f: int(f))
# but discarded the result, so it was a no-op. Assign the conversion back,
# leaving unmatched (NaN) entries untouched so int() does not raise on them.
data['time_span'] = data['time_span'].apply(lambda f: int(f) if pd.notnull(f) else f)
49
+
50
+ #每天工资
51
def get_mean_salary(s):
    """Average of the numeric range in a salary string.

    Expects values like "100-150元/天": the trailing two characters are
    presumably the unit (confirm against the salary column) and are stripped
    before splitting on "-". A single number yields that number itself.
    """
    bounds = s[:-2].split("-")
    return np.mean([int(bound) for bound in bounds])
53
# Daily wage: apply the parser directly; the lambda wrapper was redundant.
data['average_wage'] = data['salary'].apply(get_mean_salary)
data['average_wage'].head()

# Raw columns are now fully encoded into the derived ones above.
data.drop(['jobway', 'size', 'month', 'salary'], axis=1, inplace=True)
57
+
58
+ #字符型数据处理--------------------------------
59
+ #(城市)处理
60
+ #北京、上海、杭州、深圳、广州
61
+
62
def get_less_dummies(data, feature, useful_classes, prefix):
    """One-hot encode `feature`, keeping only the classes in `useful_classes`.

    Rows whose exact value is not one of the useful classes get a second
    chance: if a useful class occurs as a substring (regex search) of the
    raw value, that dummy column is set to 1.

    Returns `data` with the dummy columns (named prefix_class) appended.
    """
    dummy_cols = [prefix + "_" + token for token in useful_classes]
    # .reindex(columns=...) replaces the deprecated .ix label-based column
    # selection (removed in pandas 1.0)
    dum = pd.get_dummies(data[feature], prefix=prefix).reindex(columns=dummy_cols)
    # classes absent from the data produce all-NaN columns
    if dum.isnull().values.any():
        dum = dum.fillna(0)
    # positions of rows not matched exactly by any useful class
    search_index = np.where(np.sum(dum, axis=1) == 0)[0]
    values = data[feature]
    for j, token in enumerate(useful_classes):
        for i in search_index:
            # substring match on the raw value; indices are positional -> .iloc
            if re.search(token, values.iloc[i]):
                dum.iloc[i, j] = 1
    # print(dum.head())

    data = pd.concat([data, dum], axis=1)
    return data
77
+
78
# --- string features ----------------------------------------------------
# City: keep dummies only for the major cities of interest
# (北京/上海/杭州/深圳/广州/成都/武汉).
data = get_less_dummies(
    data,
    "address",
    ["北京", "上海", "杭州", "深圳", "广州", "成都", "武汉"],
    prefix="city",
)

# Industry: same idea for the most frequent industry labels.
data = get_less_dummies(
    data,
    "industry",
    ["互联网", "计算机", "金融", "电子商务", "企业服务", "广告", "文化传媒", "电子", "通信"],
    "industry",
)

data.head()

# Raw text columns are now encoded; drop them.
data.drop(['address', 'industry'], axis=1, inplace=True)
95
+
96
+
97
+ #专业要求
98
def get_imp_info(data, feature, useful_classes, prefix):
    """Flag rows whose raw text contains each token (case-insensitive).

    Builds a 0/1 indicator DataFrame with one column per token, named
    prefix_token; a cell is 1 when the token occurs anywhere (regex search)
    in the lower-cased text of data[feature] for that row.
    """
    dummy_cols = [prefix + "_" + token for token in useful_classes]
    # zeros matrix needs no fillna (the original's fillna(0) was redundant)
    dum = pd.DataFrame(np.zeros((len(data), len(useful_classes))), columns=dummy_cols)
    texts = data[feature]
    for j, token in enumerate(useful_classes):
        for i in range(len(data)):
            # .iloc replaces the deprecated .ix (removed in pandas 1.0)
            if re.search(token, texts.iloc[i].lower()):
                dum.iloc[i, j] = 1
    print(dum.head())

    return dum
114
+
115
+
116
# Major requirement: look for subject keywords in the posting text.
dum = get_imp_info(data, "contents", ["统计", "计算机", "数学"], "subject")
data = pd.concat([data, dum], axis=1)
data.head()
121
+
122
+ #技能要求
123
def get_imp_info2(data, feature, useful_classes, prefix):
    """Flag rows whose whitespace-tokenized text contains each token exactly.

    Unlike get_imp_info, the text is split() into words first and the token
    must equal a whole word, so short tokens do not match inside longer ones.
    Returns a 0/1 indicator DataFrame with columns named prefix_token.
    """
    dummy_cols = [prefix + "_" + token for token in useful_classes]
    # zeros matrix needs no fillna (the original's fillna(0) was redundant)
    dum = pd.DataFrame(np.zeros((len(data), len(useful_classes))), columns=dummy_cols)
    texts = data[feature]
    for j, token in enumerate(useful_classes):
        for i in range(len(data)):
            # .iloc replaces the deprecated .ix (removed in pandas 1.0)
            words = texts.iloc[i].split()
            if token in words:
                print(texts.iloc[i])
                dum.iloc[i, j] = 1
    print(dum.head())

    return dum
140
+
141
+
142
# Skill requirement: longer candidate list kept commented for reference.
#useful_classes = ["python","r语言","spss","excel","ppt","word","sql","sas","vba","office","msoffice",
#                  "hadoop","spark","hive","scala","hbase","java","matlab","linux","shell","c#"]
#                  "机器学习","数据挖掘","数学建模","自然语言处理","自然语言","文本挖掘",
top_skills = ['excel', 'sql', 'python', 'sas', 'spss', 'hadoop', 'spark', 'hive', 'shell', 'java']
# NOTE(review): this calls get_imp_info (substring match), not get_imp_info2
# (whole-word match) — presumably intentional; confirm.
dum = get_imp_info(data, "contents", top_skills, "skill")
np.sum(dum)
# top-10 skills: excel sql python sas spss | hadoop spark hive shell java
data = pd.concat([data, dum], axis=1)
data.head()
152
+
153
+ #技能与平均薪资
154
def mean_salary(useful_classes, data, salary, prefix):
    """Per-skill mean salary and posting count.

    For each skill in useful_classes, selects rows where the dummy column
    prefix_skill equals 1 and records the mean of data[salary] and the row
    count. Returns a DataFrame with columns skill / mean_salary / count
    (mean_salary is NaN when a skill matches no rows).
    """
    feature_list = [prefix + "_" + skill for skill in useful_classes]
    p = len(feature_list)
    df = pd.DataFrame(np.zeros((p, 3)), columns=["skill", "mean_salary", "count"])
    df["skill"] = useful_classes
    for i in range(p):
        # BUG FIX: .loc instead of chained indexing (df[col][mask] = ...),
        # which triggers SettingWithCopyWarning and may not write through
        wages = data.loc[data[feature_list[i]] == 1, salary]
        df.loc[df["skill"] == useful_classes[i], "mean_salary"] = np.mean(wages)
        df.loc[df["skill"] == useful_classes[i], "count"] = len(wages)
    return df
163
+
164
# Mean salary per skill, using the top-10 skill dummies created above.
useful_classes = ['excel', 'sql', 'python', 'sas', 'spss', 'hadoop', 'spark', 'hive', 'shell', 'java']
salary = "average_wage"
prefix = "skill"
df = mean_salary(useful_classes, data, salary, prefix)

import matplotlib.pyplot as plt
import seaborn as sns

# Strip plot of mean salary by skill, saved to disk as a JPEG.
plt.style.use('ggplot')
plt.figure(figsize=(8, 5))
sns.stripplot(x="skill", y="mean_salary", data=df, size=10)
plt.xlabel("skill_software")
plt.ylabel("mean_salary")
plt.savefig("skill_salary.jpg")
178
+
179
+ # 公司
180
+ data ["compname" ].value_counts ()
181
+
182
+
183
+ data .drop (['compname' ], axis = 1 ,inplace = True )
184
+ #data = pd.get_dummies(data)
185
+
186
+ #data.to_csv("data_analysis.csv",index = False,encoding = "gbk")
187
+
188
+
189
# Linear regression of average wage on all remaining features.
from sklearn.linear_model import LinearRegression

# BUG FIX: 'compname' was already dropped earlier in this script, and columns
# such as 'kmeans'/'gmm'/'nmf'/'skill_text'/'index' only exist in some
# versions of the data, so a plain drop() raises KeyError; errors="ignore"
# skips labels that are not present.
X = data.drop(["average_wage", 'contents', 'kmeans', 'gmm', 'nmf',
               "skill_text", "index", "compname"], axis=1, errors="ignore")
Y = data["average_wage"]
X = pd.get_dummies(X)
regr = LinearRegression().fit(X, Y)
# R-squared on the training data
print(regr.score(X, Y))
regr.coef_
196
+
197
+
198
+
199
+
200
+ #职位诱惑可以做词云图
0 commit comments