Collecting Data from an Entire Website

The code is as follows:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2016/11/15 3:06 PM
# @Author : Dwk
# @File : wikiLink.py

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql.cursors

# Request the URL and decode the response as UTF-8
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")

# Parse with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")

# Get all <a> tags whose href starts with /wiki/
urls = soup.find_all("a", href=re.compile(r"^/wiki/"))

# Get the database connection (once, outside the loop)
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4')

try:
    # Get a session cursor
    with connection.cursor() as cursor:
        # The SQL insert statement
        sql = "insert into `urls`(`urlName`, `urlHref`) values (%s, %s)"
        for url in urls:
            # Filter out links ending in .jpg or .JPG
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                # href already begins with /wiki/, so prepend only the domain
                href = "https://en.wikipedia.org" + url["href"]
                # Print each entry's name and its URL
                print('{0:40} --> {1:10}'.format(url.get_text(), href))
                # Execute the insert for this link
                cursor.execute(sql, (url.get_text(), href))
    # Commit the inserts
    connection.commit()
finally:
    connection.close()
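The insert statement assumes the `wikiUrls` database and the `urls` table already exist. Below is a minimal one-time setup sketch; the column types and lengths are assumptions, since only the names `id`, `urlName`, and `urlHref` appear in the scripts (the read script below filters on `id`, so it is modeled here as an auto-increment primary key):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# One-time setup sketch; column types are assumptions, not from the original.

import pymysql.cursors

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # Create the database and table that the crawler expects
        cursor.execute("create database if not exists `wikiUrls` character set utf8mb4")
        cursor.execute("use `wikiUrls`")
        cursor.execute("""
            create table if not exists `urls` (
                `id` int unsigned not null auto_increment primary key,
                `urlName` varchar(255) not null,
                `urlHref` varchar(255) not null
            ) character set utf8mb4
        """)
    connection.commit()
finally:
    connection.close()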

Below is the code for reading the saved data back from the database:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2016/11/15 4:35 PM
# @Author : Dwk
# @File : readMysql.py

import pymysql.cursors

# Get the connection
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4')
try:
    # Get a session cursor
    with connection.cursor() as cursor:
        # execute() runs the query and returns the number of matched rows
        sql = "select `urlName`, `urlHref` from `urls` where `id` is not null"
        count = cursor.execute(sql)
        print(count)
        # Fetch the first three rows
        result = cursor.fetchmany(size=3)
        print(result)
        # fetchall() returns the remaining rows, since the cursor has advanced
        result = cursor.fetchall()
        print(result)
finally:
    connection.close()
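Note that execute() only runs the query: fetchmany(size=3) then returns the first three rows, and the following fetchall() returns the remaining rows rather than all of them, because the cursor position advances as rows are fetched. If dicts are more convenient than tuples, pymysql also provides a DictCursor; a minimal sketch under the same connection settings:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sketch only: same database settings as above, rows returned as dicts.

import pymysql.cursors

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        cursor.execute("select `urlName`, `urlHref` from `urls` limit 3")
        for row in cursor.fetchall():
            # Each row is a dict, e.g. {'urlName': ..., 'urlHref': ...}
            print(row['urlName'], '-->', row['urlHref'])
finally:
    connection.close()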