Public
Created by 廖礼斌

```python
from bs4 import BeautifulSoup
import lxml
import re
import requests
import string
import json
import pymysql
from selenium import webdriver
import time as time_pl
from lxml import etree

def get_html(url):
    # Fetch a page while pretending to be a normal browser via the User-Agent header.
    # Alternative UA strings kept from the original notes:
    #   Mozilla/5.0 (Windows; U; MISE 9.0; Windows NT 9.0; en-US)
    #   Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; MISE 9.0; Windows NT 9.0; en-US)'}
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    return resp.text

def get_urls(base_url):
    # Collect the links to each company's home page from the company list page.
    soup = etree.HTML(get_html(base_url))
    urls_list = []
    sun_a = soup.xpath('//li[@class="company-item"]//p/a/@href')
    for i in range(0, len(sun_a)):
        if i % 2 == 0:  # keep every other href
            urls_list.append(sun_a[i])
    print(urls_list, len(urls_list))
    return urls_list

def baochun(stra):
    # Insert one row of company information into the gonsijiesao table.
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='990712llb',
        db='shop',
        charset='utf8'
    )
    cursor = connection.cursor()
    sql_insert1 = "insert into gonsijiesao values (%s,%s,%s,%s,%s,%s,%s,%s);"
    if cursor.execute(sql_insert1, stra):
        connection.commit()
        print("成功")
    connection.close()
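
# --- Hedged sketch, not part of the original snippet ---------------------------
# baochun() assumes the `shop` database already contains a `gonsijiesao` table
# with eight columns in the same order as str_list in get_gonsi() below:
# (name, intro text, max size, min size, address, company type, capital, year).
# The column names used here are assumptions for illustration only.
def create_gonsijiesao_table():
    connection = pymysql.connect(host='localhost', port=3306, user='root',
                                 password='990712llb', db='shop', charset='utf8')
    cursor = connection.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS gonsijiesao ("
        " name VARCHAR(255), info TEXT,"
        " guimo_max VARCHAR(32), guimo_min VARCHAR(32),"
        " dizhi VARCHAR(255), leixing VARCHAR(64),"
        " zijin VARCHAR(64), chengli_year VARCHAR(8))")
    connection.commit()
    connection.close()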

def get_gonsi(url):
    # Scrape one company's profile page with Selenium and save it via baochun().
    driver = webdriver.Chrome()
    driver.get(url)
    txt = driver.page_source
    soup = etree.HTML(txt)
    # Groups: 1 name, 2 company type, 3 registered capital, 4 size, 5 address,
    # 6 founding time.
    scom = re.search(
        '.*?<a.*?>(.*?).*?(.*?).*?(.*?).*?(.*?).*?(.*?).*?成立时间.*?class="content">(.*?)',
        txt, re.S)
    print(scom)
    name = scom.group(1)
    name = name.replace('\n', '').replace(' ', '')
    leixing = scom.group(2)
    zijin = scom.group(3)
    guimo_1 = re.match('^(.*?)-(.*?)人', scom.group(4))
    if guimo_1:
        guimo_min = guimo_1.group(1)
        guimo_max = guimo_1.group(2)
    else:
        guimo_min = "2000"
        guimo_max = "100000"
    dizhi = scom.group(5)
    time = scom.group(6)
    time = time[:4]  # keep only the year
    gonsi_info = soup.xpath(
        '//div[@id="container_left"]//div[@class="company_intro_text"]//text()')
    info = ''
    for a in gonsi_info[:-2]:
        info = info + a.replace('\n', '').replace(' ', '')
    str_list = (name, info, guimo_max, guimo_min, dizhi, leixing, zijin, time)
    print(str_list)
    driver.quit()
    try:
        baochun(str_list)
        print("成功")
    except:
        print("失败")

def baocun_zhaopin(com):
    # Insert one social-recruitment posting into the shehuizhaopin table.
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='990712llb',
        db='shop',
        charset='utf8'
    )
    cursor = connection.cursor()
    sql_insert1 = "insert into shehuizhaopin values (%s,%s,%s,%s,%s,%s,%s,%s,%s);"
    if cursor.execute(sql_insert1, com):
        connection.commit()
        print("插入数据库成功")
    else:
        print("插入数据库失败")
    connection.close()

def baocun_zhaopin1(com):
    # Insert one campus-recruitment posting into the xiaoyuanzhaopin table.
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='990712llb',
        db='shop',
        charset='utf8'
    )
    cursor = connection.cursor()
    sql_insert1 = "insert into xiaoyuanzhaopin values (%s,%s,%s,%s,%s,%s,%s,%s,%s);"
    if cursor.execute(sql_insert1, com):
        connection.commit()
        print("插入数据库成功")
    else:
        print("插入数据库失败")
    connection.close()

def get_zhaopin(url):
    # Scrape the job postings on a company's social-recruitment tab.
    driver = webdriver.Chrome()
    driver.get(url)
    url_list = []
    driver.find_element_by_xpath('//div[@class="company_navs_wrap"]//a[@data-lg-tj-no="0002"]').click()
    driver.implicitly_wait(10)
    txt1 = driver.page_source
    soup1 = etree.HTML(txt1)
    gonsi2 = soup1.xpath('//div[@class="company_main"]/h1//text()')
    gonsi = ''
    for c in gonsi2:
        gonsi = gonsi + c
    for i in range(1, 5):
        try:
            driver.implicitly_wait(10)
            driver.find_element_by_xpath('//div[@class="pages"]//span[@class="next"]').click()
            driver.implicitly_wait(10)
            txt1 = driver.page_source
            soup1 = etree.HTML(txt1)
            name1 = soup1.xpath('//li[@class="con_list_item default_list"]/@data-positionname')
            sum1 = soup1.xpath('//li[@class="con_list_item default_list"]//a/@href')
            print(sum1, name1)
            driver.implicitly_wait(10)
            for urla in sum1:
                url_list.append(urla)
        except:
            break
    print(url_list, len(url_list))  # the url of every position
    for urls in url_list:
        driver.get(urls)
        driver.implicitly_wait(10)
        txt = driver.page_source
        soup = etree.HTML(txt)
        try:
            # Groups: 1 company, 2 position, 3 salary, 4 city, 5 experience, 6 education.
            scom = re.search(
                '.*?(.*?)招聘.*?(.*?).*?(.*?).*?/(.*?) /.*?(.*?)(.*?)及以上',
                txt, re.S)
            gonsi1 = scom.group(1)
            gonsi = gonsi.replace('\n', '').replace(' ', '')
            ganwei = scom.group(2)
            daiyu1 = soup.xpath('//dd[@class="job-advantage"]//text()')
            # daiyu=daiyu[5:]
            daiyu = ''
            for a in daiyu1:
                daiyu = daiyu + a
            daiyu = daiyu.replace('\n', '').replace(' ', '')
            miaosu1 = soup.xpath('//dd[@class="job_bt"]//text()')
            # miaosu=miaosu[5:]
            miaosu = ""
            for b in miaosu1:
                miaosu = miaosu + b
            miaosu = miaosu.replace('\n', '').replace(' ', '')
            salary = re.match('^(.*?)k-(.*?)k', scom.group(3))
            salary_min = salary.group(1) + "000"
            salary_max = salary.group(2) + '000'
            dizhi = scom.group(4)
            try:
                jingyan1 = re.match('^经验(.*?)-', scom.group(5))
                jingyan = jingyan1.group(1)
            except:
                jingyan = "0"
            xueli = scom.group(6)
            doit = (ganwei, gonsi, salary_max, salary_min, dizhi, jingyan, xueli, daiyu, miaosu)
            print(doit)
            baocun_zhaopin(doit)
            time_pl.sleep(1)
        except:
            print(urls)
            # driver.get(urls)
            pass
    driver.quit()
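
# --- Hedged sketch, not part of the original snippet ---------------------------
# baocun_zhaopin() and baocun_zhaopin1() assume the `shop` database already has
# shehuizhaopin and xiaoyuanzhaopin tables with nine columns in the same order
# as the doit tuple: (position, company, max salary, min salary, city,
# experience, education, benefits, description).  Column names are assumptions.
def create_zhaopin_tables():
    connection = pymysql.connect(host='localhost', port=3306, user='root',
                                 password='990712llb', db='shop', charset='utf8')
    cursor = connection.cursor()
    columns = ("(ganwei VARCHAR(255), gonsi VARCHAR(255),"
               " salary_max VARCHAR(32), salary_min VARCHAR(32),"
               " dizhi VARCHAR(255), jingyan VARCHAR(32), xueli VARCHAR(32),"
               " daiyu TEXT, miaosu TEXT)")
    cursor.execute("CREATE TABLE IF NOT EXISTS shehuizhaopin " + columns)
    cursor.execute("CREATE TABLE IF NOT EXISTS xiaoyuanzhaopin " + columns)
    connection.commit()
    connection.close()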

def get_zhaopin1(url):
    # Scrape the job postings on a company's campus-recruitment tab.
    driver = webdriver.Chrome()
    driver.get(url)
    url_list = []
    driver.find_element_by_xpath('//div[@class="company_navs_wrap"]//a[@data-lg-tj-no="0003"]').click()
    driver.implicitly_wait(10)
    txt1 = driver.page_source
    soup1 = etree.HTML(txt1)
    gonsi2 = soup1.xpath('//div[@class="company_main"]/h1//text()')
    gonsi = ''
    for c in gonsi2:
        gonsi = gonsi + c
    driver.implicitly_wait(10)
    soup1 = etree.HTML(txt1)
    name1 = soup1.xpath('//li[@class="con_list_item default_list"]/@data-positionname')
    sum1 = soup1.xpath('//li[@class="con_list_item default_list"]//a/@href')
    print(sum1, name1)
    driver.implicitly_wait(10)
    for urla in sum1:
        url_list.append(urla)
    print(url_list, len(url_list))  # the url of every position
    for urls in url_list:
        driver.get(urls)
        driver.implicitly_wait(10)
        txt = driver.page_source
        soup = etree.HTML(txt)
        try:
            # Groups: 1 company, 2 position, 3 salary, 4 city, 5 experience, 6 education.
            scom = re.search(
                '.*?(.*?)招聘.*?(.*?).*?(.*?).*?/(.*?) /.*?(.*?)(.*?)及以上',
                txt, re.S)
            gonsi1 = scom.group(1)
            gonsi = gonsi.replace('\n', '').replace(' ', '')
            ganwei = scom.group(2)
            ganwei = ganwei.replace('\n', '').replace(' ', '')
            daiyu1 = soup.xpath('//dd[@class="job-advantage"]//text()')
            # daiyu=daiyu[5:]
            daiyu = ''
            for a in daiyu1:
                daiyu = daiyu + a
            daiyu = daiyu.replace('\n', '').replace(' ', '')
            miaosu1 = soup.xpath('//dd[@class="job_bt"]//text()')
            # miaosu=miaosu[5:]
            miaosu = ""
            for b in miaosu1:
                miaosu = miaosu + b
            miaosu = miaosu.replace('\n', '').replace(' ', '')
            salary = re.match('^(.*?)k-(.*?)k', scom.group(3))
            salary_min = salary.group(1) + "000"
            salary_max = salary.group(2) + '000'
            dizhi = scom.group(4)
            try:
                jingyan1 = re.match('^经验(.*?)-', scom.group(5))
                jingyan = jingyan1.group(1)
            except:
                jingyan = "0"
            xueli = scom.group(6)
            doit = (ganwei, gonsi, salary_max, salary_min, dizhi, jingyan, xueli, daiyu, miaosu)
            print(doit)
            baocun_zhaopin1(doit)
            time_pl.sleep(1)
        except:
            print(urls)
            # driver.get(urls)
            pass
    driver.quit()
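
# --- Hedged usage sketch, not part of the original snippet ---------------------
# The company-profile scraper could be driven from get_urls() in the same way
# that get_zhaopin() is driven in the __main__ block below.  This assumes the
# hrefs returned by get_urls() are absolute company-page URLs that get_gonsi()
# can open directly.
def crawl_company_profiles(base_url="https://www.lagou.com/gongsi/0-0-0-0"):
    for company_url in get_urls(base_url):
        try:
            get_gonsi(company_url)
        except:
            print("失败", company_url)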

if __name__ == '__main__':
    base_url = "https://www.lagou.com/gongsi/0-0-0-0"
    # re_text("https://www.lagou.com/gongsi/22013.html")
    urls = ['https://www.lagou.com/gongsi/34863.html', 'https://www.lagou.com/gongsi/5834.html',
            'https://www.lagou.com/gongsi/425270.html', 'https://www.lagou.com/gongsi/8861.html',
            'https://www.lagou.com/gongsi/71182.html', 'https://www.lagou.com/gongsi/41846.html',
            'https://www.lagou.com/gongsi/31441.html', 'https://www.lagou.com/gongsi/33219.html',
            'https://www.lagou.com/gongsi/28417.html']
    urla = ['https://www.lagou.com/gongsi/1499.html', 'https://www.lagou.com/gongsi/145486.html']
    url_com = ['https://www.lagou.com/gongsi/34863.html', 'https://www.lagou.com/gongsi/35559.html',
               'https://www.lagou.com/gongsi/5834.html', 'https://www.lagou.com/gongsi/95651.html',
               'https://www.lagou.com/gongsi/52315.html', 'https://www.lagou.com/gongsi/41846.html',
               'https://www.lagou.com/gongsi/425270.html', 'https://www.lagou.com/gongsi/11209.html',
               'https://www.lagou.com/gongsi/1291.html', 'https://www.lagou.com/gongsi/1499.html',
               'https://www.lagou.com/gongsi/1712.html', 'https://www.lagou.com/gongsi/8861.html',
               'https://www.lagou.com/gongsi/145486.html', 'https://www.lagou.com/gongsi/97882.html',
               'https://www.lagou.com/gongsi/25068.html', 'https://www.lagou.com/gongsi/33219.html']
    for url in url_com:
        print(url)
        try:
            get_zhaopin(url)
        except:
            print("失败")
            pass

""" def get_num(url,title_txt,Sname): #根据此页的url,以及此页的文章标题,将文章保存 driver = webdriver.Firefox() driver.get(url) now_handle = driver.current_window_handle for title in title_txt: driver.find_element_by_link_text(title).click() time_pl.sleep(3) driver.switch_to_window(driver.window_handles[-1]) soup=BeautifulSoup(driver.page_source,'lxml') #get_txt(soup,Sname) driver.close() driver.switch_to_window(now_handle) time_pl.sleep(3)

def get_txt(soup,Sname): #根据传过来的soup,将此文章内容提取出来 title=soup.find_all('div',class_="zt_bookTitiles")[0].get_text() author1='' author2='' author3='' txt=soup.find_all('ul',class_='Buy_detail') txt1=txt[0].find_all('li') try: author1=author1+txt1[0].find_all('a')[0].get_text() except: pass try: author2=author2+txt1[0].find_all('a')[1].get_text() except: pass try: author3=author3+txt1[0].find_all('a')[2].get_text() except: pass push_time=txt1[0].find_all('span')[1].get_text()[5:] baogao_page=txt1[1].find_all('span')[0].get_text()[5:] baogao_size=txt1[1].find_all('span')[1].get_text()[5:] baogao_sum=txt1[2].find_all('span')[0].get_text()[5:] series=txt1[2].find_all('span')[1].get_text()[5:] book=txt1[3].get_text()[5:] visitors=txt1[4].find_all('span')[0].get_text()[5:9].strip() abstract=soup.find_all('div',class_='zt_bookSum_content')[0].get_text() key1='' key2='' key3='' key4='' key5='' key_sum=soup.find_all('div',class_='zt_bookSum_keywords')[0] try: key1=key1+key_sum.find_all('a')[0].get_text() except: pass try: key2=key2+key_sum.find_all('a')[1].get_text() except: pass try: key3=key3+key_sum.find_all('a')[2].get_text() except: pass try: key4=key4+key_sum.find_all('a')[3].get_text() except: pass try: key5=key5+key_sum.find_all('a')[4].get_text() except: pass author_brief=soup.find_all('div',class_='zt_bookSum_content')[1].get_text() global Num url="http://www.jianpincn.com/skwx_jp/County.aspx?name=%25u7800%25u5C71%25u53BF" num=1

#con_url(url,num)

url = 'http://www.jianpincn.com/skwx_jp/County.aspx?name=%25u5CB3%25u897F%25u53BF' num = 101 driver = webdriver.Firefox() driver.get("http://www.jianpincn.com/skwx_jp/DataList.aspx?type=&SubLibraryID=10263&page=1") now_handle = driver.current_window_handle name='农村合作行为的类型学分析——以安徽小岗村为例' driver.find_element_by_link_text(name).click() time_pl.sleep(3) driver.switch_to_window(driver.window_handles[-1]) print (driver.current_url) now_soup = driver.page_source soup = BeautifulSoup(now_soup,'lxml') try: print (soup.find_all('div', class_='zt_bookTitiles')[0].get_text()) driver.close() driver.switch_to_window(now_handle) time_pl.sleep(3) name = '农民合作的条件分析——以安徽省小岗村农业合作社为例' driver.find_element_by_link_text(name).click() time_pl.sleep(3) driver.switch_to_window(driver.window_handles[-1]) print (driver.current_url) now_soup = driver.page_source soup = BeautifulSoup(now_soup, 'lxml') print (soup.find_all('div', class_='zt_bookTitiles')[0].get_text()) driver.close() except: print ('失败') """ `
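A short usage note: the script uses the pre-4.0 Selenium API (`find_element_by_xpath`, `switch_to_window`), so it expects Selenium 3 with a matching ChromeDriver on the PATH (and geckodriver for the commented-out Firefox code), plus the requests, lxml, beautifulsoup4 and pymysql packages and a local MySQL database named `shop` that already contains the target tables.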
