Collecting Data from an Entire Website

The code is as follows:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2016/11/15 3:06 PM
# @Author : Dwk
# @File : wikiLink.py

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql.cursors

# Request the URL and decode the response as UTF-8
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")

# Parse with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")

# Get all <a> tags whose href starts with /wiki/
urls = soup.find_all("a", href=re.compile(r"^/wiki/"))

# Get the database connection (once, outside the loop)
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4')

try:
    # Get a session cursor
    with connection.cursor() as cursor:
        # The SQL insert statement
        sql = "insert into `urls`(`urlName`, `urlHref`) values (%s, %s)"
        for url in urls:
            # Filter out links ending in .jpg or .JPG
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                # href already begins with /wiki/, so prepend only the domain
                href = "https://en.wikipedia.org" + url["href"]
                # Print each entry's name and its URL
                print('{0:40} --> {1:10}'.format(url.get_text(), href))
                # Execute the insert for this link
                cursor.execute(sql, (url.get_text(), href))
    # Commit the inserts
    connection.commit()
finally:
    connection.close()
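The insert statement assumes the `wikiUrls` database and the `urls` table already exist. Below is a minimal one-time setup sketch; the column types and lengths are assumptions, since only the names `id`, `urlName`, and `urlHref` appear in the scripts (the read script below filters on `id`, so it is modeled here as an auto-increment primary key):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# One-time setup sketch; column types are assumptions, not from the original.

import pymysql.cursors

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # Create the database and table that the crawler expects
        cursor.execute("create database if not exists `wikiUrls` character set utf8mb4")
        cursor.execute("use `wikiUrls`")
        cursor.execute("""
            create table if not exists `urls` (
                `id` int unsigned not null auto_increment primary key,
                `urlName` varchar(255) not null,
                `urlHref` varchar(255) not null
            ) character set utf8mb4
        """)
    connection.commit()
finally:
    connection.close()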

Below is the code for reading the saved data back from the database:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2016/11/15 4:35 PM
# @Author : Dwk
# @File : readMysql.py

import pymysql.cursors

# Get the connection
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4')
try:
    # Get a session cursor
    with connection.cursor() as cursor:
        # execute() runs the query and returns the number of matched rows
        sql = "select `urlName`, `urlHref` from `urls` where `id` is not null"
        count = cursor.execute(sql)
        print(count)
        # Fetch the first three rows
        result = cursor.fetchmany(size=3)
        print(result)
        # fetchall() returns the remaining rows, since the cursor has advanced
        result = cursor.fetchall()
        print(result)
finally:
    connection.close()
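Note that execute() only runs the query: fetchmany(size=3) then returns the first three rows, and the following fetchall() returns the remaining rows rather than all of them, because the cursor position advances as rows are fetched. If dicts are more convenient than tuples, pymysql also provides a DictCursor; a minimal sketch under the same connection settings:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch only: same database settings as above, rows returned as dicts.

import pymysql.cursors

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        cursor.execute("select `urlName`, `urlHref` from `urls` limit 3")
        for row in cursor.fetchall():
            # Each row is a dict, e.g. {'urlName': ..., 'urlHref': ...}
            print(row['urlName'], '-->', row['urlHref'])
finally:
    connection.close()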