Python爬虫开源代码(Python实现的文轩网爬虫完整示例)
类别:脚本大全 浏览量:2957
时间:2021-10-02 01:03:31 python爬虫开源代码
Python实现的文轩网爬虫完整示例

本文实例讲述了Python实现的文轩网爬虫,分享给大家供大家参考,具体如下:
|
encoding = utf8 import pymysql import time import sys import requests import os #捕获错误 import traceback import types #将html实体化 import cgi import warnings reload (sys) sys.setdefaultencoding( 'utf-8' ) from pyquery import pyquery as pq from lxml import etree sys.setdefaultencoding( 'utf-8' ) #屏蔽错误 warnings.filterwarnings( "ignore" ) #下载图片 def dowloadpic(imageurl,filepath): r = requests.get(imageurl,timeout = 60 ) status = r.status_code if status = = 404 : return 404 with open (filepath, "wb" ) as code: code.write(r.content) #根据详情页地址抓取数据并插入数据库 def getdata(final_url): file_open = open ( './url.txt' , 'w' ) file_open.write(final_url) file_open.close() #链接数据库 conn = pymysql.connect(host = '127.0.0.1' , port = 3306 , user = 'root' , passwd = 'root' , db = 'bookinfo' , charset = 'utf8' ) #设置浮标 cursor = conn.cursor(cursor = pymysql.cursors.dictcursor) #解析详情页面 try : detail_url = final_url c = pq(detail_url) head = c( 'html' ).attr( 'xmlns' ) err = 'http://www.w3.org/1999/xhtml' err1 = 'http://www.winxuan.com/cms/2016db_sh' if head = = err or head = = err1: return 'back' except exception, e: return 'back' i = 0 while i< 12 : text = c( '#page' ).find( '.cont' ).find( 'li' ).eq(i).text() text = text.replace( ' ' ,'') if 'i s b n' in text: isbn = text.replace( 'i s b n:' ,'') isbn = isbn.strip() sel = 'select count(*) from bi_book where isbn =' + isbn cursor.execute(sel) result = cursor.fetchone() count = result[ 'count(*)' ] if count ! = 0 : print u '已存在' return 'back' if 'isbn:' in text : isbn = text.replace( 'isbn:' ,'') isbn = isbn.strip() sel = 'select count(*) from bi_book where isbn =' + isbn cursor.execute(sel) result = cursor.fetchone() count = result[ 'count(*)' ] if count ! 
= 0 : print u '已存在' return 'back' if '作者:' in text : author = text.replace( '作者:' ,'') if '出版社:' in text : press_name = text.replace( '出版社:' ,'') if '版次:' in text : edition = text.replace( '版次:' ,'') if '印次:' in text : impressions = text.replace( '印次:' ,'') if '装帧:' in text : packaging = text.replace( '装帧:' ,'') if '开本:' in text: size = text.replace( '开本:' ,'') if '出版时间:' in text: press_time = text.replace( '出版时间:' ,'') press_time = press_time.strip() if press_time = = '无' : press_time = '1970-01-01' if '印刷时间:' in text: print_time = text.replace( '印刷时间:' ,'') print_time = print_time.strip() if print_time = = '无' : print_time = '1970-01-01' if '页数:' in text: page_num = text.replace( '页数:' ,'') if '字数:' in text: word_num = text.replace( '字数:' ,'') i + = 1 if ( 'author' in locals ().keys()) = = false: author = '' if ( 'press_time' in locals ().keys()) = = false: press_time = '1970-01-01' if ( 'print_time' in locals ().keys()) = = false: print_time = '1970-01-01' if ( 'impressions' in locals ().keys()) = = false: impressions = '' if ( 'edition' in locals ().keys()) = = false: edition = '' if ( 'page_num' in locals ().keys()) = = false: page_num = '' if ( 'word_num' in locals ().keys()) = = false: word_num = '' if ( 'packaging' in locals ().keys()) = = false: packaging = '' if ( 'size' in locals ().keys()) = = false: size = '' if ( 'press_name' in locals ().keys()) = = false: press_name = '' #暂无图片地址 none_img = 'http://static.winxuancdn.com/goods/sml_blank.jpg' #获取大小图地址 big_path = c( '.info-side' ).find( '.img' ).find( 'a' ).find( 'img' ).attr( 'src' ) if big_path is none: return 'back' elif big_path = = none_img : big_path = '' small_path = '' else : small_path = big_path.replace( '_16' , '_11' ) #获取分类 #先获取a标签html ahtml = c( '#page' ).find( '.base-nav' ).eq( 0 ).html() #解析a标签html cate = pq(ahtml) #获取分类的最后一个分类 category = cate( 'a:last' ).text() #获取书名 name = c( '.info-main' ).find( '.name' ).eq( 0 ).find( 'h1' ).eq( 0 ).text() name = name.strip() #获取价格 price = c( 
'.info-main' ).find( '.attr' ).eq( 0 ).find( '.price-n' ).eq( 0 ).find( 'b' ).text() price = price.replace( '¥' ,'') #循环获取内容简介和目录信息 k = 5 while k< 12 : title = c( '#page' ).find( '.title' ).eq(k).find( '.tab' ).find( 'h4' ).text() if '内容简介' in title: con = c( '#page' ).find( '.title' ).eq(k).nextall() det = pq(con) content = det( '.text-words-1' ).html() content = content.encode( "utf8" , "ignore" ); if '目录' in title: con = c( '#page' ).find( '.title' ).eq(k).nextall() dry = pq(con) directory = dry( '.text-words-1' ).html() directory = directory.encode( "utf8" , "ignore" ); k + = 1 #如果内容简介和目录没有的时候指定为空字符串 if ( 'content' in locals ().keys()) = = false: content = '' if ( 'directory' in locals ().keys()) = = false: directory = '' details = '内容简介<br>' + content + '<br><br>目录<br>' + directory details = cgi.escape(details) #录入时间 add_time = time.strftime( '%y-%m-%d' ,time.localtime(time.time())) #下载小图 #文件根目录 root_path = sys.path[ 0 ] #创建isbn文件夹路径 root_path = root_path.replace( '\\',' / ') isbn_path = root_path + '/download/' + isbn if big_path ! = ' ' and small_path !=' ' : #创建isbn目录 if os.path.isdir(isbn_path) = = false : os.mkdir(isbn_path) #组合下载后图片保存路径 down_img_small = isbn_path + "/small" + i
|