您的位置：首页 > 脚本大全 > 正文

python网络爬虫案例实战（python爬取cnvd漏洞库信息的实例）

更新时间：2022-03-31 12:54:38 类别：脚本大全 浏览量：230

python网络爬虫案例实战

python爬取cnvd漏洞库信息的实例

今天一同事需要整理http://ics.cnvd.org.cn/工控漏洞库里面的信息，一看960多个要整理到什么时候才结束。

所以我决定写个爬虫帮他抓取数据。

看了一下各类信息还是很规则的，感觉应该很好写。

but这个网站设置了各种反爬虫手段。

经过各种百度，还是解决问题了。

设计思路：

1.先抓取每一个漏洞信息对应的网页url

2.获取每个页面的漏洞信息

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

# -*- coding: utf-8 -*-
"""Collect the detail-page URLs listed on the CNVD ICS vulnerability index.

The script walks a fixed range of listing pages on http://ics.cnvd.org.cn/,
extracts every link that points at a vulnerability detail page and writes the
links, one per row, into the first column of a timestamped ``.xls`` workbook.

The code is written so that it runs unchanged on Python 2.7 and Python 3.
"""

import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup

# Browser-like headers sent with every request.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'zh-cn,zh;q=0.8',
    # Canonical casing restored; the all-lowercase form is not a real browser UA.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
}

# NOTE(review): hard-coded cookie, presumably copied from a browser session to
# get past the site's anti-crawler check -- it may need refreshing before a run.
cookies = {'__jsluid': '8d3f4c75f437ca82cdfad85c0f4f7c25'}

LIST_URL = "http://ics.cnvd.org.cn/?max=20&offset="
PAGE_SIZE = 20          # entries per listing page (matches max=20 in LIST_URL)
START_OFFSET = 900      # offset of the first listing page to fetch
PAGE_COUNT = 4          # number of consecutive listing pages to fetch
RETRY_DELAY_SECONDS = 1  # pause between retries so the site is not hammered
TIMESTAMP_FORMAT = '%Y%m%d%H%M%S'

# Compiled once instead of once per page.
DETAIL_LINK = re.compile('http://www.cnvd.org.cn/flaw/show')

myfile = xlwt.Workbook()
wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)

j = 0  # next free row in the sheet
a = START_OFFSET
for _ in range(PAGE_COUNT):
    url = LIST_URL + str(a)
    r = requests.get(url, headers=headers, cookies=cookies)
    print(r.status_code)
    # Keep retrying the same page until the server answers 200.
    while r.status_code != 200:
        time.sleep(RETRY_DELAY_SECONDS)
        r = requests.get(url, headers=headers, cookies=cookies)
        print(r.status_code)

    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    # The links are looked up inside the element <tbody id="tr">.
    for tag in soup.find('tbody', id='tr').find_all('a', href=DETAIL_LINK):
        link = tag.attrs['href']
        print(link)
        wtable.write(j, 0, link)
        j += 1

    a += PAGE_SIZE
    print(u"已完成%s" % a)

stamp = time.strftime(TIMESTAMP_FORMAT, time.localtime())
filename = stamp + "url.xls"
myfile.save(filename)
print(u"完成%s的url备份" % stamp)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

# -*- coding: utf-8 -*-
"""Scrape the detail page of every URL listed in a workbook via Selenium.

Each URL in the first column of the input workbook is opened in Firefox; the
page's ``<h1>`` text and the text of every ``<td>`` inside the first
``<tbody>`` are written to one row of an output ``.xls`` workbook.

The code is written so that it runs unchanged on Python 2.7 and Python 3.
"""

import re
import time
import unittest

import xlrd
import xlwt
from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select


def _backup_filename():
    """Return ``(timestamp, filename)`` for a workbook saved right now."""
    stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
    return stamp, stamp + "url.xls"


class gk(unittest.TestCase):
    """Selenium-driven scraper wrapped in a unittest test case."""

    def setUp(self):
        """Start Firefox and initialise the per-test state."""
        self.driver = webdriver.Firefox()
        # Wait up to 5 seconds for elements that are not present immediately.
        self.driver.implicitly_wait(5)
        self.verificationerrors = []
        self.accept_next_alert = True

    def test_gk(self):
        """Visit every URL from the input workbook and record its details.

        A failure on one URL does not abort the run: the workbook collected so
        far is saved as a backup and the loop moves on to the next URL.
        """
        myfile = xlwt.Workbook()
        wtable = myfile.add_sheet(u"info", cell_overwrite_ok=True)
        # NOTE(review): the companion URL script saves '<timestamp>url.xls';
        # presumably that file is renamed/converted by hand -- confirm.
        data = xlrd.open_workbook('url.xlsx')
        table = data.sheets()[0]
        driver = self.driver

        j = 0  # next output row; only advanced after a page is fully written
        for i in range(table.nrows):
            try:
                driver.get(table.cell(i, 0).value)
                title = driver.find_element(By.XPATH, "//h1").text
                print(title)
                s = [title]

                tbody = driver.find_element(By.XPATH, "//tbody")
                for tr in tbody.find_elements(By.TAG_NAME, 'tr'):
                    for td in tr.find_elements(By.TAG_NAME, "td"):
                        print(td.text)
                        s.append(td.text)

                for k, info in enumerate(s):
                    wtable.write(j, k, info)
                j += 1
            except Exception:
                # Best effort: keep what has been gathered and carry on.
                # Catching Exception (not a bare except) lets Ctrl-C through.
                stamp, filename = _backup_filename()
                myfile.save(filename)
                print(u"异常自动保存%s的漏洞信息备份" % stamp)

        stamp, filename = _backup_filename()
        myfile.save(filename)
        print(u"完成%s的漏洞信息备份" % stamp)

    def is_element_present(self, how, what):
        """Return True if an element matching locator (how, what) exists."""
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True

    def is_alert_present(self):
        """Return True if switching to an alert does not raise."""
        try:
            # Attribute access performs the switch; the value is not needed.
            self.driver.switch_to.alert
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        """Accept or dismiss the current alert and return its text.

        The alert is accepted when ``self.accept_next_alert`` is true and
        dismissed otherwise; the flag is reset to True afterwards.
        """
        try:
            alert = self.driver.switch_to.alert
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            self.accept_next_alert = True

    def tearDown(self):
        """Close the browser and fail if verification errors were recorded."""
        self.driver.quit()
        self.assertEqual([], self.verificationerrors)


if __name__ == "__main__":
    unittest.main()

好了。看看结果怎样！

python网络爬虫案例实战（python爬取cnvd漏洞库信息的实例）

ok！剩下手动整理一下，收工！

以上这篇python爬取cnvd漏洞库信息的实例就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持开心学习网。

原文链接：https://blog.csdn.net/qq1124794084/article/details/53923897

标签：Python 漏洞爬取 cnvd

上一篇：sql查询union怎么用（SQL语句之Union和Union All的用法）

下一篇：nginx 配置https（Nginx下配置Https证书详细过程）

您可能感兴趣

python网络爬虫案例实战（python爬取cnvd漏洞库信息的实例）

python网络爬虫案例实战

热门推荐

排行榜