huangkq / Python-100-Days · Commits

Commit b8725569, authored May 29, 2018 by jackfrued
更新了爬虫第3天的代码 (Updated the code for day 3 of the web crawler section)
Parent: 8adc0315

Showing 4 changed files with 109 additions and 2 deletions:

- Day66-75/02.数据采集和解析.md (+1, -1)
- Day66-75/03.存储数据.md (+58, -0)
- Day66-75/code/example05.py (+1, -1)
- Day66-75/code/example06.py (+49, -0)
Day66-75/02.数据采集和解析.md

@@ -87,7 +87,7 @@
 ...
 > 说明:更多内容可以参考BeautifulSoup的[官方文档]()。
-### 例子 - 获取知乎发现上的问题链接
+### 实例 - 获取知乎发现上的问题链接
 ```Python
 from urllib.parse import urljoin
 ...
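The example introduced by this heading collects question links from Zhihu's explore page and resolves the relative `/question/...` hrefs against the site's base URL with `urljoin` (the full listing appears in `example06.py` further down). A minimal sketch of just that resolution step; the question path used here is a made-up placeholder:

```Python
from urllib.parse import urljoin

base_url = 'https://www.zhihu.com/'

# urljoin resolves relative paths against the base URL.
print(urljoin(base_url, 'explore'))           # https://www.zhihu.com/explore
print(urljoin(base_url, '/question/123456'))  # https://www.zhihu.com/question/123456
```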
Day66-75/03.存储数据.md

@@ -197,5 +197,63 @@ b'admin'
...

### 实例 - 缓存知乎发现上的链接和页面代码

```Python
from hashlib import sha1
from urllib.parse import urljoin
import pickle
import re
import requests
import zlib
from bs4 import BeautifulSoup
from redis import Redis


def main():
    # 指定种子页面
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # 创建Redis客户端
    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    # 设置用户代理(否则访问会被拒绝)
    headers = {'user-agent': 'Baiduspider'}
    # 通过requests模块发送GET请求并指定用户代理
    resp = requests.get(seed_url, headers=headers)
    # 创建BeautifulSoup对象并指定使用lxml作为解析器
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # 查找所有href属性以/question打头的a标签
    for a_tag in soup.find_all('a', {'href': href_regex}):
        # 获取a标签的href属性值并组装完整的URL
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # 将URL处理成SHA1摘要(长度固定更简短)
        hasher = sha1()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        # 如果Redis的键'zhihu'对应的hash数据类型中没有URL的摘要就访问页面并缓存
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # 对页面进行序列化和压缩操作
            zipped_page = zlib.compress(pickle.dumps(html_page))
            # 使用hash数据类型保存URL摘要及其对应的页面代码
            client.hset('zhihu', field_key, zipped_page)
    # 显示总共缓存了多少个页面
    print('Total %d question pages found.' % client.hlen('zhihu'))


if __name__ == '__main__':
    main()
```
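The added section only shows the write path into the cache. Reading a page back reverses the same steps: look up the hash field by the URL's SHA1 digest, decompress with zlib, then unpickle. Below is a minimal sketch under the same assumptions as the example above (Redis key `'zhihu'`, the placeholder connection settings); `load_cached_page` and the sample question URL are hypothetical and not part of this commit:

```Python
from hashlib import sha1
import pickle
import zlib

from redis import Redis


def load_cached_page(client, full_url):
    """Return the cached HTML for full_url, or None if it has not been cached yet."""
    field_key = sha1(full_url.encode('utf-8')).hexdigest()
    zipped_page = client.hget('zhihu', field_key)
    if zipped_page is None:
        return None
    # Reverse the write path: zlib.decompress undoes compress, pickle.loads undoes dumps.
    return pickle.loads(zlib.decompress(zipped_page))


if __name__ == '__main__':
    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    page = load_cached_page(client, 'https://www.zhihu.com/question/123456')
    print(page[:100] if page else 'not cached yet')
```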
Day66-75/code/example05.py

@@ -50,7 +50,7 @@ def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
 ...
 # 开始执行爬虫程序
 def start_crawl(seed_url, match_pattern, *, max_depth=-1):
-    client = redis.Redis(host='120.77.222.217', port=11223, password='1qaz2wsx')
+    client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
     charsets = ('utf-8', 'gbk', 'gb2312')
     logging.info('[Redis ping]', client.ping())
     url_list = [seed_url]
 ...
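The only change in this file swaps a real-looking Redis endpoint for placeholder connection details. One way to keep such settings out of the source entirely is to read them from environment variables; a minimal sketch, where `REDIS_HOST`, `REDIS_PORT` and `REDIS_PASSWORD` are assumed variable names rather than anything defined by this project:

```Python
import os

import redis

# Fall back to the placeholder values from example05.py when the variables are unset.
client = redis.Redis(
    host=os.getenv('REDIS_HOST', '1.2.3.4'),
    port=int(os.getenv('REDIS_PORT', '6379')),
    password=os.getenv('REDIS_PASSWORD', '1qaz2wsx'),
)
```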
Day66-75/code/example06.py (new file, 100644)

```Python
from hashlib import sha1
from urllib.parse import urljoin
import pickle
import re
import requests
import zlib
from bs4 import BeautifulSoup
from redis import Redis


def main():
    # 指定种子页面
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # 创建Redis客户端
    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    # 设置用户代理
    headers = {'user-agent': 'Baiduspider'}
    # 通过requests模块发送GET请求并指定用户代理
    resp = requests.get(seed_url, headers=headers)
    # 创建BeautifulSoup对象并指定使用lxml作为解析器
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # 查找所有href属性以/question打头的a标签
    for a_tag in soup.find_all('a', {'href': href_regex}):
        # 获取a标签的href属性值并组装完整的URL
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # 将URL处理成SHA1摘要(长度固定更简短)
        hasher = sha1()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        # 如果Redis的键'zhihu'对应的hash数据类型中没有URL的摘要就访问页面并缓存
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # 对页面进行序列化和压缩操作
            zipped_page = zlib.compress(pickle.dumps(html_page))
            # 使用hash数据类型保存URL摘要及其对应的页面代码
            client.hset('zhihu', field_key, zipped_page)
    # 显示总共缓存了多少个页面
    print('Total %d question pages found.' % client.hlen('zhihu'))


if __name__ == '__main__':
    main()
```
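The comment about the SHA1 digest (长度固定更简短, i.e. fixed length and shorter) explains why the script hashes each URL before using it as a Redis hash field: whatever the URL's length, the field key is always a 40-character hex string. A quick illustration, with a made-up question URL:

```Python
from hashlib import sha1

full_url = 'https://www.zhihu.com/question/123456'
field_key = sha1(full_url.encode('utf-8')).hexdigest()
print(len(field_key), field_key)  # always 40 hex characters, regardless of URL length
```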