Collecting Data from an Entire Website

Posted on 2016-10-31

The code for collecting data from the entire site is as follows:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 2016/11/15 3:06 PM
# @Author : Dwk
# @File   : wikiLink.py

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql.cursors

# Request the URL and decode the response as UTF-8
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
# Parse it with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")
# Find all <a> tags whose href starts with /wiki/
urls = soup.find_all("a", href=re.compile("^/wiki/"))

# Print each entry's name and URL
for url in urls:
    # Skip links ending in .jpg or .JPG
    if not re.search(r"\.(jpg|JPG)$", url["href"]):
        # Print the link text and the full URL
        # (href already begins with /wiki/, so only the host is prepended)
        print('{0:40} --> {1:10}'.format(url.get_text(),
                                         "https://en.wikipedia.org" + url["href"]))
        # Open a database connection
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='0715',
                                     db='wikiUrls',
                                     charset='utf8mb4')
        try:
            # Get a cursor
            with connection.cursor() as cursor:
                # Build the SQL statement
                sql = "insert into `urls`(`urlName`, `urlHref`) values (%s, %s)"
                # Execute it
                cursor.execute(sql, (url.get_text(),
                                     "https://en.wikipedia.org" + url["href"]))
                # Commit the insert
                connection.commit()
        finally:
            connection.close()
```

Below is the code for reading the saved data back out of the database:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 2016/11/15 4:35 PM
# @Author : Dwk
# @File   : readMysql.py

import pymysql.cursors

# Open a database connection
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='0715',
                             db='wikiUrls',
                             charset='utf8mb4')
try:
    # Get a cursor
    with connection.cursor() as cursor:
        # execute() returns the number of rows the query matched
        sql = "select `urlName`, `urlHref` from `urls` where `id` is not null"
        count = cursor.execute(sql)
        print(count)
        # Fetch the first three rows, then everything that remains
        result = cursor.fetchmany(size=3)
        print(result)
        result = cursor.fetchall()
        print(result)
finally:
    connection.close()
```
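Both scripts assume a `wikiUrls` database that already contains a `urls` table with `id`, `urlName`, and `urlHref` columns; the post itself never shows that schema. The sketch below is one way the table could be created, where the column types are assumptions:

```python
# A minimal setup sketch for the database the scripts above expect.
# NOTE: the column types here are assumptions; the original post only
# implies that `urls` has `id`, `urlName`, and `urlHref` columns.
import pymysql.cursors

connection = pymysql.connect(host='localhost', user='root',
                             password='0715', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        cursor.execute("create database if not exists `wikiUrls` "
                       "default character set utf8mb4")
        cursor.execute("use `wikiUrls`")
        cursor.execute(
            "create table if not exists `urls` ("
            "  `id` int unsigned not null auto_increment primary key,"
            "  `urlName` varchar(255) not null,"
            "  `urlHref` varchar(1000) not null"
            ") default charset=utf8mb4")
    connection.commit()
finally:
    connection.close()
```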
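One design note on wikiLink.py: it opens and closes a fresh database connection for every single link, which works but is slow. A minimal alternative sketch, assuming the same `urls` table, reuses one connection and inserts all rows in a single transaction with `executemany`; this is a variation, not what the original script does:

```python
# Alternative sketch (not the original post's approach): collect all
# rows first, then insert them over one shared connection.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql.cursors

resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
soup = BeautifulSoup(resp, "html.parser")

# Build (name, full URL) pairs for every /wiki/ link that is not an image
rows = [(a.get_text(), "https://en.wikipedia.org" + a["href"])
        for a in soup.find_all("a", href=re.compile("^/wiki/"))
        if not re.search(r"\.(jpg|JPG)$", a["href"])]

connection = pymysql.connect(host='localhost', user='root',
                             password='0715', db='wikiUrls',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # executemany runs the same statement template for every row
        cursor.executemany(
            "insert into `urls`(`urlName`, `urlHref`) values (%s, %s)",
            rows)
    connection.commit()
finally:
    connection.close()
```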