huangkq / Python-100-Days
Commit 452b6f14
Authored May 28, 2018 by jackfrued
Updated the crawler day-1 code
Parent: 402e0564
Showing 3 changed files with 137 additions and 0 deletions.
Day66-75/code/example01.py  +60 -0
Day66-75/code/example02.py  +50 -0
Day66-75/code/example03.py  +27 -0
Day66-75/code/example01.py (new file, mode 100644)
from urllib.error import URLError
from urllib.request import urlopen

import re
import pymysql


def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
    """Fetch a page and try the given charsets in order until one decodes."""
    try:
        for charset in charsets:
            try:
                html = urlopen(start_url).read().decode(charset)
                break
            except UnicodeDecodeError:
                html = None
    except URLError as ex:
        print('Error:', ex)
        # On a network error, retry recursively until retry_times is used up
        return get_page_code(start_url, retry_times=retry_times - 1,
                             charsets=charsets) if retry_times > 0 else None
    return html


def main():
    url_list = ['http://sports.sohu.com/nba_a.shtml']
    visited_list = set()
    while len(url_list) > 0:
        current_url = url_list.pop(0)
        visited_list.add(current_url)
        print(current_url)
        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
        if html:
            # Collect the href value of every <a> tag on the page
            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            link_list = re.findall(link_regex, html)
            # Note: the scraped hrefs may be relative paths; see the note below this file
            url_list += link_list
            conn = pymysql.connect(host='localhost', port=3306,
                                   db='crawler', user='root',
                                   passwd='123456', charset='utf8')
            try:
                for link in link_list:
                    if link not in visited_list:
                        visited_list.add(link)
                        print(link)
                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
                        if html:
                            # Grab the article title between <h1> and the following <span>
                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
                            match_list = title_regex.findall(html)
                            if len(match_list) > 0:
                                title = match_list[0]
                                with conn.cursor() as cursor:
                                    cursor.execute('insert into tb_result (rtitle, rurl) '
                                                   'values (%s, %s)',
                                                   (title, link))
                                    conn.commit()
            finally:
                conn.close()
    print('Done!')


if __name__ == '__main__':
    main()
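A note on example01.py: the href values captured by link_regex are often relative paths (and sometimes javascript: or mailto: links), which urlopen cannot fetch directly. Below is a minimal sketch of how the queue could be normalized with the standard library's urljoin; the normalize_links helper and its filtering rule are illustrative additions, not part of the committed code:

from urllib.parse import urljoin

def normalize_links(base_url, links):
    """Resolve relative hrefs against the page URL; keep only http(s) URLs."""
    result = []
    for link in links:
        absolute = urljoin(base_url, link)  # e.g. '/nba.shtml' -> absolute URL
        if absolute.startswith(('http://', 'https://')):  # drop javascript:, mailto:, etc.
            result.append(absolute)
    return result

# In main(), this would replace the bare append:
# url_list += normalize_links(current_url, link_list)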
Day66-75/code/example02.py (new file, mode 100644)
from bs4 import BeautifulSoup

import re


def main():
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>首页</title>
    </head>
    <body>
        <h1>Hello, world!</h1>
        <p>Good!!!</p>
        <hr>
        <div>
            <h2>这是一个例子程序</h2>
            <p>静夜思</p>
            <p class="foo">床前明月光</p>
            <p id="bar">疑似地上霜</p>
            <p class="foo">举头望明月</p>
            <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
        </div>
        <a class="foo" href="http://www.qq.com">腾讯网</a>
        <img src="./img/pretty-girl.png" alt="美女">
        <img src="./img/hellokitty.png" alt="凯蒂猫">
        <img src="./static/img/pretty-girl.png" alt="美女">
        <goup>Hello, Goup!</goup>
    </body>
    </html>
    """
    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # html = resp.content.decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    # Access a tag as an attribute, much like document.title in JavaScript
    print(soup.title)
    # JavaScript: document.body.h1
    # JavaScript: document.forms[0]
    print(soup.body.h1)
    # Tag names can be matched with a regular expression
    print(soup.find_all(re.compile(r'p$')))
    # Attribute values can be matched with a regular expression too
    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+\.png')}))
    # A predicate function also works: tags that have exactly two attributes
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    # Filter by tag name plus attribute value
    print(soup.find_all('p', {'class': 'foo'}))
    # CSS selector syntax: every <a> tag that has an href attribute
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])


if __name__ == '__main__':
    main()
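Beyond find_all and select, bs4 also ships SoupStrainer for parsing only part of a document, which can save time on large pages. A minimal sketch reusing the html string from example02.py above; only_links and link_soup are illustrative names:

from bs4 import BeautifulSoup, SoupStrainer

# Parse only <a> tags; everything else is skipped during parsing
only_links = SoupStrainer('a')
link_soup = BeautifulSoup(html, 'lxml', parse_only=only_links)
for anchor in link_soup.find_all('a'):
    print(anchor.get('href'))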
Day66-75/code/example03.py (new file, mode 100644)
from bs4 import BeautifulSoup

import requests
import re


def main():
    # Fetch the page with the get function of the third-party requests library
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # Decode the response bytes (some Sohu pages use GBK encoding)
    html = resp.content.decode('gbk')
    # Create a BeautifulSoup object to parse the page (similar to the DOM in JavaScript)
    bs = BeautifulSoup(html, 'lxml')
    # Find elements with CSS selector syntax and process them in a loop
    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
    for elem in bs.select('a[test]'):
        # Read an attribute value from the element's attrs dict
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        # Strip line breaks from the extracted title with a regular expression
        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))


if __name__ == '__main__':
    main()
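Hard-coding 'gbk' in the decode call works for this particular Sohu section, but other pages declare other charsets. A minimal sketch of a more defensive variant using the detection that requests already provides (apparent_encoding is a statistical guess, so this is a fallback, not a guarantee):

import requests

resp = requests.get('http://sports.sohu.com/nba_a.shtml')
# Override the header-declared encoding with the detected one
resp.encoding = resp.apparent_encoding
html = resp.text  # decoded with the detected charset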