#coding:utf-8
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import sys,os
import django
import urllib,urllib2
# --- Python 2 environment / Django bootstrap ------------------------------
reload(sys)
sys.setdefaultencoding('utf8')  # force utf-8 default so Chinese text can be written without explicit encodes (Python 2 only)
sys.path.append('/inzhua/inzhua/')  # project root, so `inzhua.settings` and app modules are importable
# os.environ['DJANGO_SETTINGS_MODULE']='inzhua.settings'
# django.setup()
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "inzhua.settings")
from django.core.management import execute_from_command_line
# NOTE(review): runs a Django management command with the script's argv at
# import time — presumably how this spider script is launched; confirm.
execute_from_command_line(sys.argv)
from inzhua_main.models import *  # ProvinceLawOffice, LawOfficeMainInfo, LawOfficeDetailInfo (used below)
# Crawl targets and on-disk output locations.
LOG_PATH = '/inzhua/log/'
BASEURL='http://www.creditchina.gov.cn'
HTML_PATH = '/inzhua/inzhua/inzhua_main/spider/creditchina/'
lvshi_url ='http://www.lvshiminglu.com/category/shanghai/page/222'
main_url ='http://www.lvshiminglu.com'
lvshi_path = '/inzhua/inzhua/inzhua_main/spider/lvshiminglu/'
# NOTE(review): opened at import time and never written to or closed in this
# file — looks like leftover plumbing; verify before removing.
log_file = open(LOG_PATH + 'lvshiminglu.log', 'a')
def common(url):
    """Fetch *url*, archive the raw HTML under ``lvshi_path``, and return
    the page parsed as a BeautifulSoup (lxml) tree.

    The archive file name is built from the last three path segments of the
    URL, e.g. ``.../category/shanghai/page/222`` -> ``shanghaipage222.html``.
    Sleeps 4 seconds before each request to throttle the crawl.
    """
    # Make sure output directories exist before any file I/O.
    if not os.path.exists(LOG_PATH):
        os.makedirs(LOG_PATH)
    if not os.path.exists(lvshi_path):
        os.makedirs(lvshi_path)
    time.sleep(4)  # polite crawl delay
    # Renamed from `re` to avoid shadowing the stdlib regex module's name.
    response = requests.get(url)
    segments = url.split('//')[-1].split('/')[-3:]
    file_name = ''.join(segments) + '.html'  # plain join — no generator needed
    with open(os.path.join(lvshi_path, file_name), 'w') as f:
        f.write(response.text)
    return BeautifulSoup(response.text, 'lxml')
def get_main_url():
    """Scrape the category sidebar of the site index page and persist one
    ProvinceLawOffice row per category (url, name, entry count)."""
    sidebar = common(main_url).select('.widget_categories')[0]
    if len(sidebar.contents) > 1:
        for item in sidebar.find_all('li'):
            # The last text node of the <li> carries the count, e.g. "(123)".
            count_text = list(item.strings)[-1]
            link = item.find('a')
            ProvinceLawOffice.objects.create(
                url=link.get('href'),
                name=link.string,
                count=count_text.split('(')[-1].split(')')[0],
            )
def get_404():
print common(main_url).title.string #律师名录网» 未找到页面
# get_404()
def get_lvshi_main_info():
    """Walk every ProvinceLawOffice category page-by-page and store one
    LawOfficeMainInfo row per listed office, until the site returns its
    not-found page. Pages that raise are recorded in ``plo.exceptPage`` as a
    comma-separated list so a later run can resume past them.
    """
    provinceLawOffices = ProvinceLawOffice.objects.all()
    for plo in provinceLawOffices:
        # Resume support: restart one page after the first recorded page
        # number in exceptPage, or from page 1 on a fresh category.
        if plo.exceptPage=='':
            i=1
        else:
            i=int(plo.exceptPage.split(',')[0])+1
        target_url = plo.url+'/page/'+str(i)
        soup = common(target_url)
        # Loop until the site serves its "page not found" title, which marks
        # the end of the category's pagination.
        while soup.title.string !='律师名录网» 未找到页面':
            try:
                archive= soup.find(id='archive')
                li_list = archive.find_all('li')
                for li in li_list:
                    try:
                        h3 = li.find('h3')
                        # Last text node of <h3> is the office name.
                        name = list(h3.strings)[-1]
                        url = h3.find('a').get('href')
                        excerpt = li.select('.excerpt')[0].find('p').string
                        divs = li.select('.small')
                        # Two ".small" divs mean both a classify line and a
                        # tag line are present; otherwise store empty strings.
                        if len(divs)==2:
                            classify=''
                            for string in divs[0].strings:
                                classify+=string
                            tag =''
                            for string in divs[1].strings:
                                tag+=string
                            tag_url = divs[1].find('a').get('href')
                        else:
                            classify=''
                            tag=''
                            tag_url=''
                        # print name, url, excerpt, classify,tag, tag_url
                        LawOfficeMainInfo.objects.create(
                            provinceLawOffice=plo,
                            name= name.strip(),
                            url=url,
                            excerpt=excerpt,
                            classify=classify.strip(),
                            tag=tag.strip(),
                            tag_url=tag_url,
                        )
                    except Exception as e:
                        # One bad listing: remember the page and keep going.
                        print 'error at 1',e
                        plo.exceptPage=plo.exceptPage+str(i)+','
                        plo.save()
            except Exception as e:
                # Whole page failed to parse: remember it and move on.
                print 'error at 2,',e
                plo.exceptPage = plo.exceptPage + str(i) + ','
                plo.save()
            i+=1
            target_url = plo.url + '/page/' + str(i)
            print target_url
            soup = common(target_url)
        # Loop ended on the not-found page: record the final page number and
        # mark the whole category as crawled.
        plo.exceptPage = plo.exceptPage + str(i) + ','
        plo.save()
        print('current page is ',str(i))
        plo.isVisited=1
        plo.save()
def get_lvshi_detail_info(i,process_num):
    """Worker body: crawl the detail page of each unvisited office in this
    worker's slice and store every non-empty text string as a
    LawOfficeDetailInfo row.

    ``i`` is the worker index (0-based), ``process_num`` the total number of
    workers; the unvisited queryset is split into ``process_num`` contiguous
    slices and the last worker takes the remainder.

    NOTE(review): each process re-evaluates filter(isVisited=0) while
    siblings are concurrently flipping isVisited, so slices can shift and
    rows may be skipped or crawled twice — confirm whether that is accepted.
    """
    lawOffices = LawOfficeMainInfo.objects.filter(isVisited=0)
    # Python 2 int division — under Python 3 this would need // to stay an int.
    slice_len = lawOffices.count()/process_num
    if i+1<process_num:
        lawOffices = lawOffices[i*slice_len:(i+1)*slice_len]
    else:
        # Last worker takes everything that remains.
        lawOffices = lawOffices[i*slice_len:]
    for lo in lawOffices:
        print('current is %d name is %s' %(lo.id,lo.name))
        soup = common(lo.url)
        try:
            content = soup.find(id='content').select('.content')[0]
            # Persist each non-blank text fragment as its own detail row.
            for string in content.strings:
                string = string.strip()
                if string != '':
                    LawOfficeDetailInfo.objects.create(
                        lawOffice_id=lo.id,
                        info=string
                    )
        except Exception as e:
            # Parse failure: flag the office but still mark it visited below
            # so it is not retried endlessly.
            print ('error at %d name is %s' %(lo.id,lo.name))
            lo.isFailed=1
            lo.save()
        lo.isVisited=1
        lo.save()
from multiprocessing import Pool
import os, time, random
if __name__=='__main__':
print 'Parent process %s.' % os.getpid()
p =Pool()
process_num = 10
for i in range(process_num):
p.apply_async(get_lvshi_detail_info, args=(i,process_num,))
print 'Waiting for all subprocesses done...'
p.close()
p.join()
print 'All subprocesses done.'
# get_lvshi_detail_info()
# Purpose: scrape all law-office listings from the lawyer directory site
# (www.lvshiminglu.com).
#
# NOTE(review): the lines below are non-code residue copied along with this
# script from the web page it was taken from (article footer and
# recommended-reading snippets). They are kept as comments, verbatim, so the
# file remains valid Python.
#
# 爬取律师名录网站(www.lvshiminglu.com)上所有律所信息
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 下面选了最近十年里,十位名人所做的毕业演讲。那么多的故事与经历,其实只想告诉你一件事: 面对迷茫和不确定的未来,我...
# - 方法一 从上边的打印结果可以看出当前面为1.3的时候最后一位为四舍五入, 当前面为1.4的时候最后一位为五舍六入,...