Public
Created by 廖礼斌

```python
from bs4 import BeautifulSoup
import lxml
import re
import requests
import string
import json
import pymysql
from selenium import webdriver
import time as time_pl
from lxml import etree

def get_html(url):
    # Fetch a page while pretending to be a normal browser via the User-Agent header.
    # Alternative UA strings kept from the original notes:
    #   Mozilla/5.0 (Windows; U; MISE 9.0; Windows NT 9.0; en-US)
    #   Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; MISE 9.0; Windows NT 9.0; en-US)'}
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    return resp.text

def get_urls(base_url):
    # Collect the links to each company's home page from the company list page.
    soup = etree.HTML(get_html(base_url))
    urls_list = []
    sun_a = soup.xpath('//li[@class="company-item"]//p/a/@href')
    for i in range(0, len(sun_a)):
        if i % 2 == 0:  # keep every other href
            urls_list.append(sun_a[i])
    print(urls_list, len(urls_list))
    return urls_list

def baochun(stra):
    # Insert one row of company information into the gonsijiesao table.
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='990712llb',
        db='shop',
        charset='utf8'
    )
    cursor = connection.cursor()
    sql_insert1 = "insert into gonsijiesao values (%s,%s,%s,%s,%s,%s,%s,%s);"
    if cursor.execute(sql_insert1, stra):
        connection.commit()
        print("成功")
    connection.close()
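
# --- Hedged sketch, not part of the original snippet ---------------------------
# baochun() assumes the `shop` database already contains a `gonsijiesao` table
# with eight columns in the same order as str_list in get_gonsi() below:
# (name, intro text, max size, min size, address, company type, capital, year).
# The column names used here are assumptions for illustration only.
def create_gonsijiesao_table():
    connection = pymysql.connect(host='localhost', port=3306, user='root',
                                 password='990712llb', db='shop', charset='utf8')
    cursor = connection.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS gonsijiesao ("
        " name VARCHAR(255), info TEXT,"
        " guimo_max VARCHAR(32), guimo_min VARCHAR(32),"
        " dizhi VARCHAR(255), leixing VARCHAR(64),"
        " zijin VARCHAR(64), chengli_year VARCHAR(8))")
    connection.commit()
    connection.close()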

def get_gonsi(url):
    # Scrape one company's profile page with Selenium and save it via baochun().
    driver = webdriver.Chrome()
    driver.get(url)
    txt = driver.page_source
    soup = etree.HTML(txt)
    # Groups: 1 name, 2 company type, 3 registered capital, 4 size, 5 address,
    # 6 founding time.
    scom = re.search(
        '.*?<a.*?>(.*?).*?(.*?).*?(.*?).*?(.*?).*?(.*?).*?成立时间.*?class="content">(.*?)',
        txt, re.S)
    print(scom)
    name = scom.group(1)
    name = name.replace('\n', '').replace(' ', '')
    leixing = scom.group(2)
    zijin = scom.group(3)
    guimo_1 = re.match('^(.*?)-(.*?)人', scom.group(4))
    if guimo_1:
        guimo_min = guimo_1.group(1)
        guimo_max = guimo_1.group(2)
    else:
        guimo_min = "2000"
        guimo_max = "100000"
    dizhi = scom.group(5)
    time = scom.group(6)
    time = time[:4]  # keep only the year
    gonsi_info = soup.xpath(
        '//div[@id="container_left"]//div[@class="company_intro_text"]//text()')
    info = ''
    for a in gonsi_info[:-2]:
        info = info + a.replace('\n', '').replace(' ', '')
    str_list = (name, info, guimo_max, guimo_min, dizhi, leixing, zijin, time)
    print(str_list)
    driver.quit()
    try:
        baochun(str_list)
        print("成功")
    except:
        print("失败")

def baocun_zhaopin(com):
    # Insert one social-recruitment posting into the shehuizhaopin table.
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='990712llb',
        db='shop',
        charset='utf8'
    )
    cursor = connection.cursor()
    sql_insert1 = "insert into shehuizhaopin values (%s,%s,%s,%s,%s,%s,%s,%s,%s);"
    if cursor.execute(sql_insert1, com):
        connection.commit()
        print("插入数据库成功")
    else:
        print("插入数据库失败")
    connection.close()

def baocun_zhaopin1(com):
    # Insert one campus-recruitment posting into the xiaoyuanzhaopin table.
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='990712llb',
        db='shop',
        charset='utf8'
    )
    cursor = connection.cursor()
    sql_insert1 = "insert into xiaoyuanzhaopin values (%s,%s,%s,%s,%s,%s,%s,%s,%s);"
    if cursor.execute(sql_insert1, com):
        connection.commit()
        print("插入数据库成功")
    else:
        print("插入数据库失败")
    connection.close()

def get_zhaopin(url):
    # Scrape the job postings on a company's social-recruitment tab.
    driver = webdriver.Chrome()
    driver.get(url)
    url_list = []
    driver.find_element_by_xpath('//div[@class="company_navs_wrap"]//a[@data-lg-tj-no="0002"]').click()
    driver.implicitly_wait(10)
    txt1 = driver.page_source
    soup1 = etree.HTML(txt1)
    gonsi2 = soup1.xpath('//div[@class="company_main"]/h1//text()')
    gonsi = ''
    for c in gonsi2:
        gonsi = gonsi + c
    for i in range(1, 5):
        try:
            driver.implicitly_wait(10)
            driver.find_element_by_xpath('//div[@class="pages"]//span[@class="next"]').click()
            driver.implicitly_wait(10)
            txt1 = driver.page_source
            soup1 = etree.HTML(txt1)
            name1 = soup1.xpath('//li[@class="con_list_item default_list"]/@data-positionname')
            sum1 = soup1.xpath('//li[@class="con_list_item default_list"]//a/@href')
            print(sum1, name1)
            driver.implicitly_wait(10)
            for urla in sum1:
                url_list.append(urla)
        except:
            break
    print(url_list, len(url_list))  # the url of every position
    for urls in url_list:
        driver.get(urls)
        driver.implicitly_wait(10)
        txt = driver.page_source
        soup = etree.HTML(txt)
        try:
            # Groups: 1 company, 2 position, 3 salary, 4 city, 5 experience, 6 education.
            scom = re.search(
                '.*?(.*?)招聘.*?(.*?).*?(.*?).*?/(.*?) /.*?(.*?)(.*?)及以上',
                txt, re.S)
            gonsi1 = scom.group(1)
            gonsi = gonsi.replace('\n', '').replace(' ', '')
            ganwei = scom.group(2)
            daiyu1 = soup.xpath('//dd[@class="job-advantage"]//text()')
            # daiyu=daiyu[5:]
            daiyu = ''
            for a in daiyu1:
                daiyu = daiyu + a
            daiyu = daiyu.replace('\n', '').replace(' ', '')
            miaosu1 = soup.xpath('//dd[@class="job_bt"]//text()')
            # miaosu=miaosu[5:]
            miaosu = ""
            for b in miaosu1:
                miaosu = miaosu + b
            miaosu = miaosu.replace('\n', '').replace(' ', '')
            salary = re.match('^(.*?)k-(.*?)k', scom.group(3))
            salary_min = salary.group(1) + "000"
            salary_max = salary.group(2) + '000'
            dizhi = scom.group(4)
            try:
                jingyan1 = re.match('^经验(.*?)-', scom.group(5))
                jingyan = jingyan1.group(1)
            except:
                jingyan = "0"
            xueli = scom.group(6)
            doit = (ganwei, gonsi, salary_max, salary_min, dizhi, jingyan, xueli, daiyu, miaosu)
            print(doit)
            baocun_zhaopin(doit)
            time_pl.sleep(1)
        except:
            print(urls)
            # driver.get(urls)
            pass
    driver.quit()
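
# --- Hedged sketch, not part of the original snippet ---------------------------
# baocun_zhaopin() and baocun_zhaopin1() assume the `shop` database already has
# shehuizhaopin and xiaoyuanzhaopin tables with nine columns in the same order
# as the doit tuple: (position, company, max salary, min salary, city,
# experience, education, benefits, description).  Column names are assumptions.
def create_zhaopin_tables():
    connection = pymysql.connect(host='localhost', port=3306, user='root',
                                 password='990712llb', db='shop', charset='utf8')
    cursor = connection.cursor()
    columns = ("(ganwei VARCHAR(255), gonsi VARCHAR(255),"
               " salary_max VARCHAR(32), salary_min VARCHAR(32),"
               " dizhi VARCHAR(255), jingyan VARCHAR(32), xueli VARCHAR(32),"
               " daiyu TEXT, miaosu TEXT)")
    cursor.execute("CREATE TABLE IF NOT EXISTS shehuizhaopin " + columns)
    cursor.execute("CREATE TABLE IF NOT EXISTS xiaoyuanzhaopin " + columns)
    connection.commit()
    connection.close()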

def get_zhaopin1(url):
    # Scrape the job postings on a company's campus-recruitment tab.
    driver = webdriver.Chrome()
    driver.get(url)
    url_list = []
    driver.find_element_by_xpath('//div[@class="company_navs_wrap"]//a[@data-lg-tj-no="0003"]').click()
    driver.implicitly_wait(10)
    txt1 = driver.page_source
    soup1 = etree.HTML(txt1)
    gonsi2 = soup1.xpath('//div[@class="company_main"]/h1//text()')
    gonsi = ''
    for c in gonsi2:
        gonsi = gonsi + c
    driver.implicitly_wait(10)
    soup1 = etree.HTML(txt1)
    name1 = soup1.xpath('//li[@class="con_list_item default_list"]/@data-positionname')
    sum1 = soup1.xpath('//li[@class="con_list_item default_list"]//a/@href')
    print(sum1, name1)
    driver.implicitly_wait(10)
    for urla in sum1:
        url_list.append(urla)
    print(url_list, len(url_list))  # the url of every position
    for urls in url_list:
        driver.get(urls)
        driver.implicitly_wait(10)
        txt = driver.page_source
        soup = etree.HTML(txt)
        try:
            # Groups: 1 company, 2 position, 3 salary, 4 city, 5 experience, 6 education.
            scom = re.search(
                '.*?(.*?)招聘.*?(.*?).*?(.*?).*?/(.*?) /.*?(.*?)(.*?)及以上',
                txt, re.S)
            gonsi1 = scom.group(1)
            gonsi = gonsi.replace('\n', '').replace(' ', '')
            ganwei = scom.group(2)
            ganwei = ganwei.replace('\n', '').replace(' ', '')
            daiyu1 = soup.xpath('//dd[@class="job-advantage"]//text()')
            # daiyu=daiyu[5:]
            daiyu = ''
            for a in daiyu1:
                daiyu = daiyu + a
            daiyu = daiyu.replace('\n', '').replace(' ', '')
            miaosu1 = soup.xpath('//dd[@class="job_bt"]//text()')
            # miaosu=miaosu[5:]
            miaosu = ""
            for b in miaosu1:
                miaosu = miaosu + b
            miaosu = miaosu.replace('\n', '').replace(' ', '')
            salary = re.match('^(.*?)k-(.*?)k', scom.group(3))
            salary_min = salary.group(1) + "000"
            salary_max = salary.group(2) + '000'
            dizhi = scom.group(4)
            try:
                jingyan1 = re.match('^经验(.*?)-', scom.group(5))
                jingyan = jingyan1.group(1)
            except:
                jingyan = "0"
            xueli = scom.group(6)
            doit = (ganwei, gonsi, salary_max, salary_min, dizhi, jingyan, xueli, daiyu, miaosu)
            print(doit)
            baocun_zhaopin1(doit)
            time_pl.sleep(1)
        except:
            print(urls)
            # driver.get(urls)
            pass
    driver.quit()
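
# --- Hedged usage sketch, not part of the original snippet ---------------------
# The company-profile scraper could be driven from get_urls() in the same way
# that get_zhaopin() is driven in the __main__ block below.  This assumes the
# hrefs returned by get_urls() are absolute company-page URLs that get_gonsi()
# can open directly.
def crawl_company_profiles(base_url="https://www.lagou.com/gongsi/0-0-0-0"):
    for company_url in get_urls(base_url):
        try:
            get_gonsi(company_url)
        except:
            print("失败", company_url)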

if __name__ == '__main__':
    base_url = "https://www.lagou.com/gongsi/0-0-0-0"
    # re_text("https://www.lagou.com/gongsi/22013.html")
    urls = ['https://www.lagou.com/gongsi/34863.html', 'https://www.lagou.com/gongsi/5834.html',
            'https://www.lagou.com/gongsi/425270.html', 'https://www.lagou.com/gongsi/8861.html',
            'https://www.lagou.com/gongsi/71182.html', 'https://www.lagou.com/gongsi/41846.html',
            'https://www.lagou.com/gongsi/31441.html', 'https://www.lagou.com/gongsi/33219.html',
            'https://www.lagou.com/gongsi/28417.html']
    urla = ['https://www.lagou.com/gongsi/1499.html', 'https://www.lagou.com/gongsi/145486.html']
    url_com = ['https://www.lagou.com/gongsi/34863.html', 'https://www.lagou.com/gongsi/35559.html',
               'https://www.lagou.com/gongsi/5834.html', 'https://www.lagou.com/gongsi/95651.html',
               'https://www.lagou.com/gongsi/52315.html', 'https://www.lagou.com/gongsi/41846.html',
               'https://www.lagou.com/gongsi/425270.html', 'https://www.lagou.com/gongsi/11209.html',
               'https://www.lagou.com/gongsi/1291.html', 'https://www.lagou.com/gongsi/1499.html',
               'https://www.lagou.com/gongsi/1712.html', 'https://www.lagou.com/gongsi/8861.html',
               'https://www.lagou.com/gongsi/145486.html', 'https://www.lagou.com/gongsi/97882.html',
               'https://www.lagou.com/gongsi/25068.html', 'https://www.lagou.com/gongsi/33219.html']
    for url in url_com:
        print(url)
        try:
            get_zhaopin(url)
        except:
            print("失败")
            pass

""" def get_num(url,title_txt,Sname): #根据此页的url,以及此页的文章标题,将文章保存 driver = webdriver.Firefox() driver.get(url) now_handle = driver.current_window_handle for title in title_txt: driver.find_element_by_link_text(title).click() time_pl.sleep(3) driver.switch_to_window(driver.window_handles[-1]) soup=BeautifulSoup(driver.page_source,'lxml') #get_txt(soup,Sname) driver.close() driver.switch_to_window(now_handle) time_pl.sleep(3)

def get_txt(soup,Sname): #根据传过来的soup,将此文章内容提取出来 title=soup.find_all('div',class_="zt_bookTitiles")[0].get_text() author1='' author2='' author3='' txt=soup.find_all('ul',class_='Buy_detail') txt1=txt[0].find_all('li') try: author1=author1+txt1[0].find_all('a')[0].get_text() except: pass try: author2=author2+txt1[0].find_all('a')[1].get_text() except: pass try: author3=author3+txt1[0].find_all('a')[2].get_text() except: pass push_time=txt1[0].find_all('span')[1].get_text()[5:] baogao_page=txt1[1].find_all('span')[0].get_text()[5:] baogao_size=txt1[1].find_all('span')[1].get_text()[5:] baogao_sum=txt1[2].find_all('span')[0].get_text()[5:] series=txt1[2].find_all('span')[1].get_text()[5:] book=txt1[3].get_text()[5:] visitors=txt1[4].find_all('span')[0].get_text()[5:9].strip() abstract=soup.find_all('div',class_='zt_bookSum_content')[0].get_text() key1='' key2='' key3='' key4='' key5='' key_sum=soup.find_all('div',class_='zt_bookSum_keywords')[0] try: key1=key1+key_sum.find_all('a')[0].get_text() except: pass try: key2=key2+key_sum.find_all('a')[1].get_text() except: pass try: key3=key3+key_sum.find_all('a')[2].get_text() except: pass try: key4=key4+key_sum.find_all('a')[3].get_text() except: pass try: key5=key5+key_sum.find_all('a')[4].get_text() except: pass author_brief=soup.find_all('div',class_='zt_bookSum_content')[1].get_text() global Num url="http://www.jianpincn.com/skwx_jp/County.aspx?name=%25u7800%25u5C71%25u53BF" num=1

#con_url(url,num)

url = 'http://www.jianpincn.com/skwx_jp/County.aspx?name=%25u5CB3%25u897F%25u53BF' num = 101 driver = webdriver.Firefox() driver.get("http://www.jianpincn.com/skwx_jp/DataList.aspx?type=&SubLibraryID=10263&page=1") now_handle = driver.current_window_handle name='农村合作行为的类型学分析——以安徽小岗村为例' driver.find_element_by_link_text(name).click() time_pl.sleep(3) driver.switch_to_window(driver.window_handles[-1]) print (driver.current_url) now_soup = driver.page_source soup = BeautifulSoup(now_soup,'lxml') try: print (soup.find_all('div', class_='zt_bookTitiles')[0].get_text()) driver.close() driver.switch_to_window(now_handle) time_pl.sleep(3) name = '农民合作的条件分析——以安徽省小岗村农业合作社为例' driver.find_element_by_link_text(name).click() time_pl.sleep(3) driver.switch_to_window(driver.window_handles[-1]) print (driver.current_url) now_soup = driver.page_source soup = BeautifulSoup(now_soup, 'lxml') print (soup.find_all('div', class_='zt_bookTitiles')[0].get_text()) driver.close() except: print ('失败') """ `
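A short usage note: the script uses the pre-4.0 Selenium API (`find_element_by_xpath`, `switch_to_window`), so it expects Selenium 3 with a matching ChromeDriver on the PATH (and geckodriver for the commented-out Firefox code), plus the requests, lxml, beautifulsoup4 and pymysql packages and a local MySQL database named `shop` that already contains the target tables.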
