Commit a90ece2

author: situxy
committed: add files
1 parent 7b28bbb · commit a90ece2

8 files changed: +1524 −0 lines changed

analysis.py

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 29 20:30:12 2018

@author: situ
"""

import numpy as np
import pandas as pd
import os
import re

#os.chdir("E:/graduate/class/EDA/final")
os.chdir("/Users/situ/Documents/EDA/final")
data = pd.read_csv("data_with_skill.csv", encoding="gbk")
data.head()
data.info()

data.drop(["jobname", "jobgood", "url", "city"], axis=1, inplace=True)

# Numeric features ----------------------
# Working days per week, e.g. "3天/周" -> 3
data.jobway.unique()
mapping = {}
for i in range(2, 7):
    mapping[str(i) + '天/周'] = i
print(mapping)
data['day_per_week'] = data['jobway'].map(mapping)
data['day_per_week'].head()


# Company size: bucket the six raw labels into small / medium / large
data["size"].unique()
data["comp_size"] = ""
data.loc[data['size'] == '少于15人', "comp_size"] = '小型企业'
data.loc[data['size'] == '15-50人', "comp_size"] = '小型企业'
data.loc[data['size'] == '50-150人', "comp_size"] = '中型企业'
data.loc[data['size'] == '150-500人', "comp_size"] = '中型企业'
data.loc[data['size'] == '500-2000人', "comp_size"] = '大型企业'
data.loc[data['size'] == '2000人以上', "comp_size"] = '大型企业'

# Internship duration, e.g. "实习3个月" -> 3
data.month.unique()
mapping = {}
for i in range(1, 22):
    mapping["实习" + str(i) + '个月'] = i
print(mapping)
data['time_span'] = data['month'].map(mapping)
data['time_span'] = data['time_span'].apply(int)

# Daily wage: strip the trailing "/天" and average the two range endpoints
def get_mean_salary(s):
    return np.mean([int(i) for i in s[:(len(s) - 2)].split("-")])

data['average_wage'] = data['salary'].apply(get_mean_salary)
data['average_wage'].head()
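# Worked example, assuming the scraped format looks like "150-200/天":
#   "150-200/天"[:-2] -> "150-200"; split("-") -> ["150", "200"]; mean = 175.0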

data.drop(['jobway', 'size', 'month', 'salary'], axis=1, inplace=True)

# Categorical text features --------------------------------
# City
# 北京 / 上海 / 杭州 / 深圳 / 广州

def get_less_dummies(data, feature, useful_classes, prefix):
    useful_classes_prefix = [prefix + "_" + token for token in useful_classes]
    # keep only the dummy columns of interest; absent classes come back as NaN
    dum = pd.get_dummies(data[feature], prefix=prefix).reindex(columns=useful_classes_prefix)
    if dum.isnull().values.sum() > 0:
        dum = dum.fillna(0)
    # rows that matched no class exactly: fall back to a substring search
    search_index = np.where(np.sum(dum, axis=1) == 0)[0]
    for j in range(len(useful_classes)):
        token = useful_classes[j]
        for i in search_index:
            if len(re.findall(token, data.loc[i, feature])) > 0:
                dum.loc[i, useful_classes_prefix[j]] = 1
    # print(dum.head())

    data = pd.concat([data, dum], axis=1)
    return data

feature = "address"
useful_classes = ["北京", "上海", "杭州", "深圳", "广州", "成都", "武汉"]
data = get_less_dummies(data, feature, useful_classes, prefix="city")
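# Fallback illustration: an address such as "北京市海淀区" has no exact dummy
# column, but the substring pass still sets city_北京 = 1 (made-up example input)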

# Industry
# 互联网 / 计算机 / 金融 / 电子商务 / 企业服务


feature = "industry"
useful_classes = ["互联网", "计算机", "金融", "电子商务", "企业服务", "广告", "文化传媒", "电子", "通信"]
data = get_less_dummies(data, feature, useful_classes, "industry")

data.head()


data.drop(['address', 'industry'], axis=1, inplace=True)


# Required major
def get_imp_info(data, feature, useful_classes, prefix):
    """Extract 0/1 flags directly from the raw text."""
    useful_classes_prefix = [prefix + "_" + token for token in useful_classes]
    dum = pd.DataFrame(np.zeros((len(data), len(useful_classes))), columns=useful_classes_prefix)
    for j in range(len(useful_classes)):
        token = useful_classes[j]
        # print(token)
        for i in range(len(data)):
            # print(i)
            if len(re.findall(token, data.loc[i, feature].lower())) > 0:
                dum.loc[i, useful_classes_prefix[j]] = 1
    print(dum.head())

    # data = pd.concat([data,dum],axis = 1)
    return dum


feature = "contents"
useful_classes = ["统计", "计算机", "数学"]
dum = get_imp_info(data, feature, useful_classes, "subject")
data = pd.concat([data, dum], axis=1)
data.head()

# Required skills
def get_imp_info2(data, feature, useful_classes, prefix):
    """Extract 0/1 flags from the pre-tokenized (space-separated) text."""
    useful_classes_prefix = [prefix + "_" + token for token in useful_classes]
    dum = pd.DataFrame(np.zeros((len(data), len(useful_classes))), columns=useful_classes_prefix)
    for j in range(len(useful_classes)):
        token = useful_classes[j]
        # print(token)
        for i in range(len(data)):
            word_list = data.loc[i, feature].split()
            if token in word_list:
                print(data.loc[i, feature])
                dum.loc[i, useful_classes_prefix[j]] = 1
    print(dum.head())

    # data = pd.concat([data,dum],axis = 1)
    return dum


feature = "contents"
#useful_classes = ["python","r语言","spss","excel","ppt","word","sql","sas","vba","office","msoffice",
#                  "hadoop","spark","hive","scala","hbase","java","matlab","linux","shell","c#",
#                  "机器学习","数据挖掘","数学建模","自然语言处理","自然语言","文本挖掘"]
useful_classes = ['excel', 'sql', 'python', 'sas', 'spss', 'hadoop', 'spark', 'hive', 'shell', 'java']
dum = get_imp_info(data, feature, useful_classes, "skill")
np.sum(dum)
# Top 10 required skills: excel sql python sas spss | hadoop spark hive shell java
data = pd.concat([data, dum], axis=1)
data.head()

# Skills vs. average wage
def mean_salary(useful_classes, data, salary, prefix):
    feature_list = [prefix + "_" + skill for skill in useful_classes]
    p = len(feature_list)
    df = pd.DataFrame(np.zeros((p, 3)), columns=["skill", "mean_salary", "count"])
    df["skill"] = useful_classes
    for i in range(p):
        with_skill = data[salary][data[feature_list[i]] == 1]
        df.loc[df["skill"] == useful_classes[i], "mean_salary"] = np.mean(with_skill)
        df.loc[df["skill"] == useful_classes[i], "count"] = len(with_skill)
    return df
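# The returned frame has one row per skill: the mean daily wage over the
# postings that require it, and the count of such postings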

useful_classes = ['excel', 'sql', 'python', 'sas', 'spss', 'hadoop', 'spark', 'hive', 'shell', 'java']
salary = "average_wage"
prefix = "skill"
df = mean_salary(useful_classes, data, salary, prefix)

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
plt.figure(figsize=(8, 5))
sns.stripplot(x="skill", y="mean_salary", data=df, size=10)
plt.xlabel("skill_software")
plt.ylabel("mean_salary")
plt.savefig("skill_salary.jpg")

# Company
data["compname"].value_counts()


data.drop(['compname'], axis=1, inplace=True)
#data = pd.get_dummies(data)

#data.to_csv("data_analysis.csv",index = False,encoding = "gbk")


from sklearn.linear_model import LinearRegression
# errors="ignore": compname has already been dropped above
X = data.drop(["average_wage", 'contents', 'kmeans', 'gmm', 'nmf', "skill_text", "index", "compname"],
              axis=1, errors="ignore")
Y = data["average_wage"]
X = pd.get_dummies(X)
regr = LinearRegression().fit(X, Y)
# print R-squared
print(regr.score(X, Y))
regr.coef_
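# To interpret the fit, pair each coefficient with its feature name, e.g.:
# print(pd.Series(regr.coef_, index=X.columns).sort_values())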



# The "job perks" (jobgood) text would make a good word cloud
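
A minimal sketch of that word-cloud idea (not part of this commit), assuming the jieba and wordcloud packages; it re-reads jobgood from the source CSV because the column was dropped from data at the top of the script, and the font path is a placeholder that must point to a CJK-capable font:

import jieba
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

raw = pd.read_csv("data_with_skill.csv", encoding="gbk")
# segment the perk phrases, then join into one space-separated string
text = " ".join(jieba.cut(" ".join(raw["jobgood"].dropna().astype(str))))
wc = WordCloud(font_path="/path/to/a/cjk-font.ttf",  # placeholder path
               background_color="white", width=800, height=500).generate(text)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("jobgood_wordcloud.png")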

crawl_shixiseng.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 17 20:21:59 2018

@author: situ
"""

import requests, re, time
import os
import pandas as pd
import numpy as np
from urllib.parse import urlencode
from lxml import etree


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

replace_dict = {
    "&#xf09f": "0",
    "&#xeff8": "1",
    "&#xecfa": "2",
    "&#xf748": "3",
    "&#xf298": "4",
    "&#xed58": "5",
    "&#xee56": "6",
    "&#xe253": "7",
    "&#xe504": "8",
    "&#xecfd": "9"}
def get_links(start_url, n, replace_dict):
    all_pd = pd.DataFrame()
    for i in list(range(1, n + 1)):
        print("-------- crawling listings page %d --------" % i)
        url = start_url + "&p=%s" % str(i)
        try:
            wb_data = requests.get(url, headers=headers)
            wb_data.encoding = wb_data.apparent_encoding
            links = re.findall('class="name-box clearfix".*?href="(.*?)"', wb_data.text, re.S)
            for link in links:
                print(link)
                try:
                    one_pd = get_infos('https://www.shixiseng.com' + link, replace_dict)
                except Exception:
                    # an all-scalar dict needs an explicit index to build a one-row frame
                    one_pd = pd.DataFrame({"url": link, "jobname": "", "salary": "", "address": "",
                                           "education": "", "jobway": "", "month": "",
                                           "jobgood": "", "contents": "", "compname": "",
                                           "city": "", "size": "", "industry": ""}, index=[0])
                    print("can't crawl " + link)
                all_pd = pd.concat([all_pd, one_pd])
        except Exception:
            print("can't reach page %d" % i)

    return all_pd

def get_infos(url, replace_dict):
    one_dict = {}
    wb_data = requests.get(url, headers=headers)
    print(wb_data.status_code)
    wb_data.encoding = wb_data.apparent_encoding
    jobname = re.findall('<div class="new_job_name" title="(.*?)">', wb_data.text, re.S)
    salarys = re.findall('class="job_money cutom_font">(.*?)</span>', wb_data.text, re.S)
    addresses = re.findall('class="job_position">(.*?)</span>', wb_data.text, re.S)
    educations = re.findall('class="job_academic">(.*?)</span>', wb_data.text, re.S)
    jobways = re.findall('class="job_week cutom_font">(.*?)</span>', wb_data.text, re.S)
    months = re.findall('class="job_time cutom_font">(.*?)</span>', wb_data.text, re.S)
    jobgoods = re.findall('class="job_good".*?>(.*?)</div>', wb_data.text, re.S)
    contents = re.findall(r'div class="job_til">([\s\S]*?)<div class="job_til">', wb_data.text, re.S)[0].replace(' ', '').replace('\n', '').replace('&nbsp;', '')
    contents = re.sub(r'<[\s\S]*?>', "", str(contents))
    compname = re.findall('class="job_com_name">(.*?)</div>', wb_data.text, re.S)
    compintro = re.findall('<div class="job_detail job_detail_msg"><span>([\s\S]*?)</span></div>', wb_data.text, re.S)
    city, size, industry = re.sub(r'<[\s\S]*?>', " ", str(compintro[0])).split()
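    # Tag-stripping turns compintro (assumed markup like
    # "北京</span><span>150-500人</span><span>互联网") into "北京 150-500人 互联网",
    # so split() yields exactly the city, size, and industry fields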
    for salary, address, education, jobway, month, jobgood in zip(salarys, addresses, educations, jobways, months, jobgoods):
        # decode the custom-font digit entities in the numeric fields
        for key, value in replace_dict.items():
            salary = salary.replace(key, value)
            jobway = jobway.replace(key, value)
            month = month.replace(key, value)
        one_dict = {"url": url, "jobname": jobname, "salary": salary, "address": address,
                    "education": education, "jobway": jobway, "month": month,
                    "jobgood": jobgood, "contents": contents, "compname": compname,
                    "city": city, "size": size, "industry": industry}
        # list_i=[url,salary,address,education,jobway,month,jobgood,contents,compname,city,size,industry]
    print(jobname)
    one_pd = pd.DataFrame(one_dict)
    return one_pd


if __name__ == '__main__':
    os.chdir("E:/graduate/class/EDA/final")
    print('Enter the keyword you want to search for:')
    compRawStr = input('Keyword: \n')  # read from stdin; separate multiple keywords with spaces
    print('Crawling internship listings about "' + compRawStr.capitalize() + '"!')
    d = {'k': compRawStr.encode('utf-8')}
    word = urlencode(d)
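    # urlencode percent-encodes the UTF-8 bytes of the keyword, e.g.
    # urlencode({'k': '数据'.encode('utf-8')}) -> "k=%E6%95%B0%E6%8D%AE"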

    start_url = "https://www.shixiseng.com/interns/st-intern_c-None_?%s" % word
    result = requests.get(start_url, headers=headers)
    # result.status_code
    result.encoding = 'utf-8'
    selector = etree.HTML(result.text)
    # the tenth pager item links to the last page; read n off its "p=" parameter
    last_page_link = selector.xpath('//*[@id="pagebar"]/ul/li[10]/a/@href')
    n = int(last_page_link[0].split("p=")[1])
    print("Will crawl %d pages of listings" % n)
    time_start = time.time()
    df = get_links(start_url, n, replace_dict)
    df.to_csv(compRawStr + "_共" + str(n) + "页.csv", index=False, encoding="gb18030")
    time_end = time.time()
    print("Successfully crawled %d listings for [%s]" % (len(df), compRawStr))
    print('total cost: %f seconds' % (time_end - time_start))

salary_and_skill.png

72.5 KB

tagxedo.png

1.21 MB
