import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class LagouSpider(object):
    """Selenium-driven crawler for the lagou.com Python job listing.

    The spider loads the listing URL in Chrome, opens every job linked
    from each result page in a separate window, extracts a handful of
    fields from the detail page and prints them as a dict.
    """

    # Last <span> of the pager; the code treats it as the "next page" button.
    NEXT_BTN_XPATH = "//div[@class='pager_container']/span[last()]"
    # Container of the job title; used to detect that a detail page rendered.
    JOB_NAME_XPATH = "//div[@class='job-name']"
    # Seconds to wait for JOB_NAME_XPATH before giving up on a detail page.
    WAIT_TIMEOUT = 10

    def __init__(self):
        """Start a Chrome session and remember the listing URL."""
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=%E5%85%A8%E5%9B%BD#filterBox"

    def run(self):
        """Crawl all result pages, then quit the browser.

        Pagination stops when the pager is absent or its last span carries
        the ``pager_next_disabled`` class. The browser is quit even when an
        error aborts the crawl, so the instance cannot be reused afterwards.
        """
        try:
            self.driver.get(self.url)
            while True:
                self.parse_page_list(self.driver.page_source)
                # find_elements (plural) returns [] instead of raising, so a
                # listing without a pager ends the crawl cleanly.
                next_btns = self.driver.find_elements(By.XPATH, self.NEXT_BTN_XPATH)
                if not next_btns:
                    break
                next_btn = next_btns[0]
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                next_btn.click()
                # Give the list time to refresh; reading page_source right
                # after the click can return the previous page again.
                time.sleep(1)
        finally:
            self.driver.quit()

    def parse_page_list(self, source):
        """Visit every job detail page linked from one listing page.

        Args:
            source: HTML text of a listing page.
        """
        html = etree.HTML(source)
        detail_urls = html.xpath("//div/a[@class='position_link']/@href")
        for detail_url in detail_urls:
            self.get_detail_page(detail_url)
            # Pause between detail requests.
            time.sleep(1)

    def get_detail_page(self, detail_url):
        """Open a job detail page in a new window, parse it, and return.

        The detail window is always closed and focus is always handed back
        to the listing window, even if waiting or parsing fails.

        Args:
            detail_url: URL of the job detail page.

        Raises:
            selenium.common.exceptions.TimeoutException: if the job title
                container does not appear within ``WAIT_TIMEOUT`` seconds.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", detail_url)
        # Switch to the most recently opened window.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, self.JOB_NAME_XPATH))
            )
            self.parse_datail_page(self.driver.page_source)
        finally:
            # Close the detail window.
            self.driver.close()
            # Switch back to the job list page.
            self.driver.switch_to.window(list_handle)

    def parse_datail_page(self, source):
        """Extract the job fields from a detail page and print them.

        Fields whose element is missing come back as an empty string
        instead of aborting the crawl.

        Args:
            source: HTML text of a job detail page.

        Returns:
            dict with keys ``name``, ``job_salary``, ``city``, ``work_year``,
            ``education``, ``company_name`` and ``desc`` (also printed).
        """
        html = etree.HTML(source)
        first = self._first_text
        clean = self._clean_text

        job_name = first(html.xpath("//div[@class='job-name']/h2/text()"))

        # The first four spans are read as salary, city, experience and
        # education, in that order; pad with "" when fewer are present.
        spans = html.xpath("//dd[@class='job_request']//span")
        fields = [first(span.xpath("./text()")) for span in spans[:4]]
        fields += [""] * (4 - len(fields))
        job_salary, city, work_year, education = fields

        company_name = first(html.xpath("//h3[@class='fl']//text()"))
        desc = "".join(
            html.xpath("//dl[@id='job_detail']/dd[@class='job_bt']//text()")
        ).strip()

        position = {
            "name": job_name,
            "job_salary": job_salary,
            "city": clean(city),
            "work_year": clean(work_year),
            "education": clean(education),
            "company_name": company_name,
            "desc": clean(desc),
        }
        print(position)
        return position

    @staticmethod
    def _first_text(nodes, default=""):
        """Return the first string of *nodes* stripped, or *default* if empty."""
        return nodes[0].strip() if nodes else default

    @staticmethod
    def _clean_text(text):
        """Remove every slash and whitespace character from *text*.

        For str patterns ``\\s`` already matches the non-breaking space
        (``\\xa0``), so no extra entry is needed in the character class;
        listing the letters ``x`` and ``a`` there would delete them from
        ordinary words.
        """
        return re.sub(r"[/\s]", "", text)
# Run the crawl only when this file is executed as a script, so that
# importing the module does not launch a browser.
if __name__ == "__main__":
    lagou = LagouSpider()
    lagou.run()
View Code
如需转载，请注明文章出处和来源网址：http://www.divcss5.com/html/h54706.shtml