做淘宝电商比较厉害的网站博物馆网站建设的目标

当前位置: 首页 > news >正文

# threads.py -- worker threads for a Tianya (bbs.tianya.cn) forum crawler.
#
# Reconstructed from a scraping-mangled Python 2 listing and ported to
# Python 3 (urllib2 -> urllib.request, Queue -> queue, print statements ->
# print()).  For crawling efficiency the module defines the page
# download/parse thread, the image download thread, and a small thread pool.
import os
import queue
import re
import threading
import urllib.request

# Guards state shared between threads: the fetched-image set and the
# page_contents dict.
thread_lock = threading.RLock()


def download_page(html_url):
    """Download a forum page and return it as utf-8 text, or None on failure.

    Plain pages are served without any special headers, so a bare Request
    is enough (unlike download_image below).
    """
    try:
        req = urllib.request.Request(html_url)
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8', 'ignore')
    except Exception:
        print('download %s failed' % html_url)
        return None


def download_image(image_url, referer):
    """Download one image and return its raw bytes, or None on failure.

    Browser-like headers are required here: the image host
    (img3.laibafile.cn) does not return the real image unless the request
    carries a plausible Referer/User-Agent etc.
    """
    try:
        req = urllib.request.Request(image_url)
        req.add_header('Host', 'img3.laibafile.cn')
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0')
        req.add_header('Accept', 'image/png,image/*;q=0.8,*/*;q=0.5')
        req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
        req.add_header('Referer', referer)
        req.add_header('Origin', 'http://bbs.tianya.cn')
        req.add_header('Connection', 'keep-alive')
        with urllib.request.urlopen(req) as response:
            return response.read()
    except Exception:
        print('download %s failed' % image_url)
        return None


class download_html_page(threading.Thread):
    """Thread that downloads thread pages and extracts the poster's content.

    name          -- thread name (diagnostics only)
    page_range    -- queue.Queue of page numbers still to crawl (shared)
    page_contents -- dict {page_number: parsed html}, shared, lock-protected
    img_urls      -- queue.Queue of 'image_url|referer' strings for fetch_img
    html_url      -- url of the first page of the forum thread
    first_page    -- text of page 1, already downloaded by the caller
                     (kept so page 1 is not fetched twice)
    """

    def __init__(self, name, page_range, page_contents, img_urls, html_url, first_page):
        threading.Thread.__init__(self)
        self.name = name
        self.page_range = page_range
        self.page_contents = page_contents
        self.img_urls = img_urls
        self.html_url = html_url
        self.first_page = first_page

    def is_louzhu(self, s):
        """Return True if the snippet was written by the original poster.

        The poster's entries carry a commented-out "host-ico" marker div.
        """
        result = re.search(r'<!-- <div class="host-ico">(.*?)</div> -->', s, re.S)
        return result is not None

    def get_img_url(self, s, page_url):
        """Queue the poster's image urls and rewrite the snippet to local paths.

        Returns the rewritten content, or None when the snippet is the
        poster replying to another user (marked by a long dashed quote
        separator), which is filtered out entirely.
        """
        # A run of 15+ dashes before <br> marks a quoted reply -> skip it.
        if re.search(r'-{15,}<br>', s, re.S) is not None:
            return None
        local_paths = []
        for one_url in re.findall(r'<img.*?original="(?P<imgurl>.*?)".*?/><br>', s, flags=re.S):
            # Hand the url (with its referer page) to the image threads.
            self.img_urls.put(one_url + '|' + page_url)
            local_paths.append('img/' + re.search(r'\w+\.jpg', one_url).group(0))
        # Re-assemble the text with <img> tags pointing at the local copies.
        segments = re.split(r'<img .*?/><br>', s.strip())
        content = segments[0].strip()
        for i, path in enumerate(local_paths):
            content += '\n<img src="' + path + '"/>\n<br>'
            content += segments[i + 1].strip()
        return content

    def parse_page(self, html_page, page_url):
        """Extract and concatenate every content block by the original poster.

        Accepts str or utf-8 bytes.  The original code called
        html_page.decode('utf-8') and discarded the result; the decode is
        now actually applied when bytes come in.
        """
        if isinstance(html_page, bytes):
            html_page = html_page.decode('utf-8', 'ignore')
        items = re.findall(
            r'<div class="atl-content">(?P<islouzhu>.*?)<div class="bbs-content.*?">(?P<content>.*?)</div>',
            html_page, re.S)
        page_content = ''
        for head, body in items:
            if self.is_louzhu(head):
                one_div = self.get_img_url(body, page_url)
                if one_div is not None:
                    page_content += one_div
        return page_content

    def run(self):
        """Consume page numbers until the queue is empty, then signal done."""
        while True:
            try:
                # get_nowait() replaces the original's racy
                # "while qsize() > 0: get()" pattern.
                page_number = self.page_range.get_nowait()
            except queue.Empty:
                break
            page_url = re.sub(r'-(\d+)\.shtml',
                              '-' + str(page_number) + '.shtml', self.html_url)
            page_content = ''
            print('thread %s is downloading %s' % (self.name, page_url))
            if page_url == self.html_url:
                # Page 1 was already fetched by the caller; reuse it.
                page_content = self.parse_page(self.first_page, page_url)
            else:
                page = download_page(page_url)
                if page is not None:
                    page_content = self.parse_page(page, page_url)
            # page_contents is a plain dict shared across threads.  The
            # original called .put(content, number) on it -- which a dict
            # does not have; restore the lock-protected assignment the
            # author left commented out.
            with thread_lock:
                self.page_contents[page_number] = page_content
        # Tell the image threads that no more urls will arrive.
        self.img_urls.put('finished')


class fetch_img(threading.Thread):
    """Thread that downloads queued image urls into the local img/ directory.

    img_urls     -- queue.Queue of 'image_url|referer' strings; the bare
                    string 'finished' is the shutdown sentinel
    download_img -- shared set of urls already fetched (lock-protected)
    """

    def __init__(self, name, img_urls, download_img):
        threading.Thread.__init__(self)
        self.name = name
        self.img_urls = img_urls
        self.download_img = download_img

    def run(self):
        while True:
            message = self.img_urls.get().split('|')
            img_url = message[0]
            if img_url == 'finished':
                # Re-queue the sentinel so sibling image threads also stop.
                self.img_urls.put('finished')
                break
            with thread_lock:
                if img_url in self.download_img:
                    continue  # another thread already fetched this one
            print('fetching image %s' % img_url)
            image = download_image(img_url, message[1])
            if image is None:
                # Failed download: the original would have crashed writing
                # None to disk; skip the url instead.
                continue
            image_name = re.search(r'\w+\.jpg', img_url).group(0)
            with open(os.path.join('img', image_name), 'wb') as img:
                img.write(image)
            with thread_lock:
                self.download_img.add(img_url)


class thread_pool:
    """Builds and drives the page-parsing and image-fetching threads."""

    def __init__(self, page_range, page_contents, html_url, first_page):
        self.page_range = page_range        # queue of page numbers to crawl
        self.page_contents = page_contents  # dict receiving parsed pages
        self.img_urls = queue.Queue()       # page threads produce, image threads consume
        self.html_url = html_url
        self.first_page = first_page
        self.download_img = set()           # de-duplicates fetched images
        self.page_thread_pool = []
        self.image_thread_pool = []

    def build_thread(self, page, image):
        """Create *page* page-threads and *image* image-threads (not started)."""
        for i in range(page):
            t = download_html_page('page thread%d' % i, self.page_range,
                                   self.page_contents, self.img_urls,
                                   self.html_url, self.first_page)
            self.page_thread_pool.append(t)
        for i in range(image):
            t = fetch_img('image thread%d' % i, self.img_urls, self.download_img)
            self.image_thread_pool.append(t)

    def all_start(self):
        """Start every worker thread."""
        for t in self.page_thread_pool:
            t.start()
        for t in self.image_thread_pool:
            t.start()

    def all_join(self):
        """Block until every worker thread has finished."""
        for t in self.page_thread_pool:
            t.join()
        for t in self.image_thread_pool:
            t.join()

# -*- coding: utf-8 -*-
# Entry point of the Tianya image crawler: asks for a thread url, derives
# the valid page range from the pager links, then delegates downloading
# and parsing to the workers defined in threads.py.
#
# Ported from Python 2: raw_input/eval-style input -> input() + int(),
# Queue -> queue.  NOTE: this project is for practice only -- do not use
# it commercially.
import re
import queue

import threads

if __name__ == '__main__':
    html_url = input('enter the url: ')
    html_page = threads.download_page(html_url)
    max_page = 0
    title = ''
    if html_page is not None:
        # The thread title sits inside a nested <span class="s_title">.
        search_title = re.search(
            r'<span class="s_title"><span style="\S+?">(?P<title>.+?)</span></span>',
            html_page, re.S)
        title = search_title.groupdict()['title']
        # Pager links look like /post-xxx-yyy-N.shtml with N repeated as
        # the link text; the largest N found is the last page number.
        search_page = re.findall(
            r'<a href="/post-\S+?-\d+?-(?P<page>\d+?)\.shtml">(?P=page)</a>',
            html_page, re.S)
        for page_number in search_page:
            page_number = int(page_number)
            if page_number > max_page:
                max_page = page_number
    print('title:%s' % title)
    print('max page number: %s' % max_page)
    # Keep asking until the bounds are inside [1, max_page] and ordered.
    # (Python 2's input() eval'ed the text; int(input()) replaces that
    # unsafe behaviour.)
    start_page = 0
    while start_page < 1 or start_page > max_page:
        start_page = int(input('input the start page number:'))
    end_page = 0
    while end_page < start_page or end_page > max_page:
        end_page = int(input('input the end page number:'))
    page_range = queue.Queue()
    for i in range(start_page, end_page + 1):
        page_range.put(i)
    page_contents = {}
    # Renamed from the original local 'thread_pool', which shadowed the
    # threads.thread_pool class name.
    pool = threads.thread_pool(page_range, page_contents, html_url, html_page)
    pool.build_thread(1, 1)
    pool.all_start()
    pool.all_join()