您的位置：首页 > 脚本大全 > 正文

python爬虫开源代码（Python实现的文轩网爬虫完整示例）

更新时间：2021-10-02 01:03:31 类别：脚本大全 浏览量：2957

python爬虫开源代码

Python实现的文轩网爬虫完整示例

本文实例讲述了python实现的文轩网爬虫。分享给大家供大家参考，具体如下：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

# encoding=utf8

import pymysql

import time

import sys

import requests

import os

#捕获错误

import traceback

import types

#将html实体化

import cgi

import warnings

reload(sys)

sys.setdefaultencoding('utf-8')

from pyquery import pyquery as pq

from lxml import etree

sys.setdefaultencoding('utf-8')

#屏蔽错误

warnings.filterwarnings("ignore")

#下载图片

def dowloadpic(imageurl,filepath):
    """Download the resource at ``imageurl`` and save it to ``filepath``.

    The URL is fetched with ``requests.get`` using ``timeout=60``. On a
    successful response the raw body is written to ``filepath`` in binary
    mode, overwriting any existing file.

    Returns:
        The HTTP status code (e.g. ``404``) when the server answers with an
        error status (>= 400); nothing is written in that case.
        ``None`` after the file has been written.

    Exceptions raised by ``requests.get`` or by opening/writing the file are
    not caught here and propagate to the caller.
    """
    response = requests.get(imageurl, timeout=60)
    status = response.status_code
    # Originally only 404 was rejected, so other error responses (403, 500,
    # ...) had their error page saved to disk as if it were the image.
    # Reject every client/server error and report its code the same way the
    # 404 case always has.
    if status >= 400:
        return status
    with open(filepath, "wb") as image_file:
        image_file.write(response.content)

#根据详情页地址抓取数据并插入数据库

def getdata(final_url):

file_open=open('./url.txt', 'w')

file_open.write(final_url)

file_open.close()

#链接数据库

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')

#设置浮标

cursor = conn.cursor(cursor=pymysql.cursors.dictcursor)

#解析详情页面

try:

detail_url=final_url

c=pq(detail_url)

head=c('html').attr('xmlns')

err='http://www.w3.org/1999/xhtml'

err1='http://www.winxuan.com/cms/2016db_sh'

if head == err or head == err1:

return 'back'

except exception, e:

return 'back'

i=0

while i<12:

text = c('#page').find('.cont').find('li').eq(i).text()

text=text.replace('　','')

if 'i s b n' in text:

isbn=text.replace('i s b n：','')

isbn=isbn.strip()

sel='select count(*) from bi_book where isbn ='+isbn

cursor.execute(sel)

result=cursor.fetchone()

count=result['count(*)']

if count != 0 :

print u'已存在'

return 'back'

if 'isbn：' in text :

isbn=text.replace('isbn：','')

isbn=isbn.strip()

sel='select count(*) from bi_book where isbn ='+isbn

cursor.execute(sel)

result=cursor.fetchone()

count=result['count(*)']

if count != 0 :

print u'已存在'

return 'back'

if '作者：' in text :

author = text.replace('作者：','')

if '出版社：' in text :

press_name=text.replace('出版社：','')

if '版次：' in text :

edition=text.replace('版次：','')

if '印次：' in text :

impressions=text.replace('印次：','')

if '装帧：' in text :

packaging=text.replace('装帧：','')

if '开本：' in text:

size=text.replace('开本：','')

if '出版时间：' in text:

press_time=text.replace('出版时间：','')

press_time=press_time.strip()

if press_time == '无':

press_time='1970-01-01'

if '印刷时间：' in text:

print_time=text.replace('印刷时间：','')

print_time=print_time.strip()

if print_time== '无':

print_time='1970-01-01'

if '页数：' in text:

page_num=text.replace('页数：','')

if '字数：' in text:

word_num=text.replace('字数：','')

i+=1

if ('author' in locals().keys()) == false:

author = ''

if ('press_time' in locals().keys()) == false:

press_time = '1970-01-01'

if ('print_time' in locals().keys()) == false:

print_time = '1970-01-01'

if ('impressions' in locals().keys()) == false:

impressions = ''

if ('edition' in locals().keys())== false:

edition = ''

if ('page_num' in locals().keys())== false:

page_num = ''

if ('word_num' in locals().keys())== false:

word_num = ''

if ('packaging' in locals().keys())== false:

packaging = ''

if ('size' in locals().keys())== false:

size = ''

if ('press_name' in locals().keys())== false:

press_name = ''

#暂无图片地址

none_img='http://static.winxuancdn.com/goods/sml_blank.jpg'

#获取大小图地址

big_path=c('.info-side').find('.img').find('a').find('img').attr('src')

if big_path is none:

return 'back'

elif big_path == none_img :

big_path=''

small_path=''

else :

small_path=big_path.replace('_16','_11')

#获取分类

#先获取a标签html

ahtml=c('#page').find('.base-nav').eq(0).html()

#解析a标签html

cate=pq(ahtml)

#获取分类的最后一个分类

category=cate('a:last').text()

#获取书名

name=c('.info-main').find('.name').eq(0).find('h1').eq(0).text()

name=name.strip()

#获取价格

price=c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()

price=price.replace('¥','')

#循环获取内容简介和目录信息

k=5

while k<12:

title=c('#page').find('.title').eq(k).find('.tab').find('h4').text()

if '内容简介' in title:

con=c('#page').find('.title').eq(k).nextall()

det=pq(con)

content=det('.text-words-1').html()

content=content.encode("utf8", "ignore");

if '目录' in title:

con=c('#page').find('.title').eq(k).nextall()

dry=pq(con)

directory=dry('.text-words-1').html()

directory=directory.encode("utf8", "ignore");

k+=1

#如果内容简介和目录没有的时候指定为空字符串

if ('content' in locals().keys())== false:

content = ''

if ('directory' in locals().keys())== false:

directory = ''

details = '内容简介<br>'+content+'<br><br>目录<br>'+directory

details=cgi.escape(details)

#录入时间

add_time = time.strftime('%y-%m-%d',time.localtime(time.time()))

#下载小图

#文件根目录

root_path=sys.path[0]

#创建isbn文件夹路径

root_path=root_path.replace('\\','/')

isbn_path=root_path+'/download/'+isbn

if big_path != '' and small_path !='' :

#创建isbn目录

if os.path.isdir(isbn_path) ==false :

os.mkdir(isbn_path)

#组合下载后图片保存路径

down_img_small = isbn_path+"/small"+

i
 
  标签：Python 爬虫

上一篇：linux中swap分区的作用是（linux swap交换分区详解）

                	  
			  下一篇：mysql查询慢有哪些原因（MySQL 查询速度慢的原因）

   


  
      您可能感兴趣
				
					
  超简单使用Python换脸实例（超简单使用Python换脸实例）
  python线程池有几种（对python 多线程中的守护线程与join的用法详解）
  python模块学习之random模块（详解Python基础random模块随机数的生成）
  微信昵称python（Python 微信之获取好友昵称并制作wordcloud的实例）
  python3编程过程（Python3模拟登录操作实例分析）
  python云服务技术（Python脚本修改阿里云的访问控制列表的方法）
  python飞机大战游戏背景（python实现飞机大战游戏）
  python监控系统界面（Python远程视频监控程序的实例代码）
  笨办法学python3目录（如何愉快地迁移到 Python 3）
  python爬虫怎么爬取vip资源（Python网络爬虫之爬取微博热搜）
  python网络爬虫案例实战（python爬取cnvd漏洞库信息的实例）
  python怎么操作mysql（详解Python的数据库操作pymysql）
  python numpy 安装（python3.6下Numpy库下载与安装图文教程）
  python报表可视化（使用Python快速制作可视化报表的方法）
  pythondjango后台管理（基于腾讯云服务器部署微信小程序后台服务Python+Django）
  Python实现模拟点击（用python实现刷点击率的示例代码）
高中数学题（高中数学题型总结及解题方法）
冰岛旅游攻略（冰岛旅游攻略及花费）
为什么现在年轻人越来越喜欢买衣服（为什么现在年轻人越来越喜欢买衣服穿）
怎么做好SEO（怎么做好seo内容优化）
冬季钓鱼子线用 长 还是 短（冬季钓鱼子线用）
鱼竿 夏钓短，冬钓长 ，一定是这样 认清优缺点在选竿（鱼竿夏钓短冬钓长）
					
					
            
         
 


        
             

				 
    
        热门推荐
    
    
    
    
       Sql的decimal、float、double类型的区别
mysql中innodb的特性（Mysql InnoDB的锁定机制实例详解）
mysql创建存储过程的代码（MySQL修改存储过程的详细步骤）
python设计一个聊天机器人（手把手教你使用Python创建微信机器人）
vue3 props用法（vue3组合API中setup、 ref、reactive的使用大全）
.NET中IsNullOrEmpty和IsNullOrWhiteSpace的区别
css技巧100个（css小技巧汇总）
nginx跳转规则配置上下文（基于nginx实现上游服务器动态自动上下线无需reload的实现方法）
python实现sql脚本规范（基于Python的SQL Server数据库实现对象同步轻量级）
css分割线使用教程（css实现文章分割线样式的多种方法总结）    

    
   

    


  
   
		排行榜
	
	 
		
       1. python连接到本地的mysql数据库（Python实现连接MySql数据库及增删改查操作详解）
2. python调用excel教程（利用python在excel里面直接使用sql函数的方法）
3. python实现简单加密（Python 隐藏输入密码时屏幕回显的实例）
4. python爬取豆瓣评分排行榜（Python爬虫——爬取豆瓣电影Top250代码实例）
5. Python实现模拟点击（用python实现刷点击率的示例代码）
6. pythondjango后台管理（基于腾讯云服务器部署微信小程序后台服务Python+Django）
7. python报表可视化（使用Python快速制作可视化报表的方法）
8. python numpy 安装（python3.6下Numpy库下载与安装图文教程）
9. python怎么操作mysql（详解Python的数据库操作pymysql）
		
	







  
	 
  
   



	







     
    
	
        首页
            编程学习
            Web前端
            数据库
            软件设计
            
 开心学习 ©2013-2021 保留所有权利