huangkq / Python-100-Days / Commits / 25830d37

Commit 25830d37 authored Jun 02, 2018 by jackfrued
Updated the content of the web scraping section
parent 79f297a4
Showing 6 changed files with 331 additions and 5 deletions
Day66-75/02.数据采集和解析.md  +3  -2
Day66-75/03.存储数据.md  +10  -2
Day66-75/code/example10.py  +33  -0
Day66-75/code/main.py  +132  -0
Day66-75/code/main_redis.py  +150  -0
玩转PyCharm(上).md  +3  -1
Day66-75/02.数据采集和解析.md
```diff
@@ -5,8 +5,9 @@
 1. Downloading data - urllib / requests / aiohttp.
 2. Parsing data - re / lxml / beautifulsoup4 (bs4) / pyquery.
 3. Caching and persistence - pymysql / redis / sqlalchemy / peewee / pymongo.
-4. Serialization and compression - pickle / json / zlib.
-5. Scheduling - processes / threads / coroutines.
+4. Generating digests - hashlib.
+5. Serialization and compression - pickle / json / zlib.
+6. Scheduling - processes / threads / coroutines.
 ### Analyzing HTML pages
```
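The newly inserted item 4 points at hashlib for generating digests. As a minimal, illustrative sketch (the URL below is made up), a SHA-1 digest gives every URL a short, fixed-length fingerprint that can serve as a dedup or cache key:

```python
from hashlib import sha1

# A 40-character hex digest has the same length no matter how long the URL is,
# which makes it convenient as a key in Redis or MongoDB.
url = 'http://m.sohu.com/some/very/long/path?with=query'  # illustrative only
print(sha1(url.encode('utf-8')).hexdigest())
```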
Day66-75/03.存储数据.md
```diff
@@ -191,8 +191,14 @@ b'admin'
 #### Introduction to MongoDB
 MongoDB is a document-oriented database management system that appeared in 2009. Written in C++, it aims to provide scalable, high-performance data storage for web applications. Although MongoDB is usually classified as a NoSQL product, it sits somewhere between relational and non-relational databases: among non-relational databases it is the most feature-rich and the closest to a relational database.
 MongoDB stores data as documents; a document consists of a series of key-value pairs and resembles a JSON object. MongoDB currently supports Windows, macOS, Linux, Solaris, and other platforms, and it ships drivers for many programming languages, Python among them.
 #### Installing and configuring MongoDB
 #### Implementing CRUD operations with MongoDB
```
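The hunk above adds the "Implementing CRUD operations with MongoDB" heading without its body. A minimal, hypothetical pymongo sketch of the four operations (the local address, database, and collection names are placeholders, not taken from the commit):

```python
import pymongo

# Placeholder connection, database, and collection names for illustration
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
col = client.testdb.questions

col.insert_one({'title': 'Hello MongoDB', 'votes': 0})                 # Create
doc = col.find_one({'title': 'Hello MongoDB'})                         # Read
col.update_one({'title': 'Hello MongoDB'}, {'$set': {'votes': 10}})    # Update
col.delete_one({'title': 'Hello MongoDB'})                             # Delete
```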
```diff
@@ -226,13 +232,15 @@ def main():
     # Create a BeautifulSoup object and use lxml as the parser
     soup = BeautifulSoup(resp.text, 'lxml')
     href_regex = re.compile(r'^/question')
+    # Turn URLs into SHA-1 digests (fixed length and shorter)
+    hasher_proto = sha1()
     # Find all <a> tags whose href attribute starts with /question
     for a_tag in soup.find_all('a', {'href': href_regex}):
         # Get the href attribute of the <a> tag and assemble the full URL
         href = a_tag.attrs['href']
         full_url = urljoin(base_url, href)
-        # Turn the URL into a SHA-1 digest (fixed length and shorter)
-        hasher = sha1()
+        # Feed the URL into a copy of the prototype hasher to get its SHA-1 digest
+        hasher = hasher_proto.copy()
         hasher.update(full_url.encode('utf-8'))
         field_key = hasher.hexdigest()
         # If the hash stored under the Redis key 'zhihu' has no entry for this digest, fetch the page and cache it
```
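The change above hoists a single sha1() prototype out of the loop and copies it per URL; copying an existing hasher avoids re-constructing one on every iteration and yields the same digest. A small standalone sketch of the pattern (the URLs are made up):

```python
from hashlib import sha1

hasher_proto = sha1()  # one prototype, created once

for full_url in ['http://example.com/a', 'http://example.com/b']:  # illustrative URLs
    hasher = hasher_proto.copy()            # fresh, independent state per URL
    hasher.update(full_url.encode('utf-8'))
    # identical to sha1(full_url.encode('utf-8')).hexdigest()
    print(hasher.hexdigest())
```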
Day66-75/code/example10.py
0 → 100644
```python
import requests
from bs4 import BeautifulSoup
# selenium is an automated testing tool;
# it can drive a browser to visit web pages
from selenium import webdriver


def main():
    # First download chromedriver and put the executable somewhere on PATH
    # Create a Google Chrome browser instance
    driver = webdriver.Chrome()
    # Load the page through the browser (dynamically generated content is rendered too)
    driver.get('https://www.taobao.com/markets/mm/mm2017')
    # driver.page_source contains the content created dynamically by JavaScript
    soup = BeautifulSoup(driver.page_source, 'lxml')
    all_images = soup.select('img[src]')
    for image in all_images:
        url = image.get('src')
        try:
            if not str(url).startswith('http'):
                url = 'http:' + url
            filename = url[url.rfind('/') + 1:]
            print(filename)
            resp = requests.get(url)
            with open('c:/images/' + filename, 'wb') as f:
                f.write(resp.content)
        except OSError:
            print(filename + ' failed to download!')
    print('Image download finished!')


if __name__ == '__main__':
    main()
```
\ No newline at end of file
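The script opens a visible Chrome window. Assuming a Selenium release whose webdriver.Chrome accepts an options argument and a Chrome build that supports the --headless flag, the same page can be rendered without a window; this is only a sketch, not part of the commit:

```python
from selenium import webdriver

# Assumption: Selenium accepts `options=` here and Chrome understands --headless
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get('https://www.taobao.com/markets/mm/mm2017')
print(len(driver.page_source))  # the JavaScript-rendered HTML is still available
driver.quit()
```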
Day66-75/code/main.py
0 → 100644
```python
from enum import Enum, unique
from queue import Queue
from random import random
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    # Try each candidate charset until the page bytes decode successfully
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):
    # Decorator that retries the wrapped function, sleeping a random
    # interval between attempts

    def __init__(self, *, retry_times=3, wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        # Collect in-domain links that have not been visited yet
        soup = BeautifulSoup(html_page, 'lxml')
        url_links = []
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if full_url not in visited_urls:
                    url_links.append(full_url)
        return url_links

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider, tasks_queue):
        super().__init__(name=name, daemon=True)
        self.spider = spider
        self.tasks_queue = tasks_queue

    def run(self):
        # Take a URL off the shared queue, crawl it, and enqueue newly found links
        while True:
            current_url = self.tasks_queue.get()
            visited_urls.add(current_url)
            self.spider.status = SpiderStatus.WORKING
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                url_links = self.spider.parse(html_page)
                for url_link in url_links:
                    self.tasks_queue.put(url_link)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


visited_urls = set()


def main():
    task_queue = Queue()
    task_queue.put('http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider(), task_queue)
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()

    # Busy-wait until the queue is drained and no spider is still working
    while not task_queue.empty() or is_any_alive(spider_threads):
        pass

    print('Over!')


if __name__ == '__main__':
    main()
```
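main.py keeps the main thread alive with a busy-wait loop. As a purely hypothetical alternative, not part of the commit, the standard Queue.task_done()/join() protocol lets the main thread block instead of spinning:

```python
from queue import Queue
from threading import Thread

task_queue = Queue()
task_queue.put('http://m.sohu.com/')


def worker():
    while True:
        url = task_queue.get()
        try:
            print('crawling', url)   # placeholder for fetch/parse/enqueue
        finally:
            task_queue.task_done()   # tell the queue this item is finished


for i in range(10):
    Thread(target=worker, daemon=True).start()

task_queue.join()                    # returns once every queued item is marked done
print('Over!')
```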
Day66-75/code/main_redis.py
0 → 100644
```python
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    # Try each candidate charset until the page bytes decode successfully
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):
    # Decorator that retries the wrapped function, sleeping a random
    # interval between attempts

    def __init__(self, *, retry_times=3, wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        # Push unvisited in-domain links onto the Redis task list
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        while True:
            # Poll the Redis task list until a URL becomes available
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    # Use the SHA-1 digest of the URL as the MongoDB document id
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


redis_client = redis.Redis(host='120.77.222.217',
                           port=6379, password='1qaz2wsx')
mongo_client = pymongo.MongoClient(host='120.77.222.217', port=27017)
db = mongo_client.msohu
sohu_data_coll = db.webpages
hasher_proto = sha1()


def main():
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()

    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        pass

    print('Over!')


if __name__ == '__main__':
    main()
```
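main_redis.py stores each fetched page as zlib-compressed pickle bytes wrapped in a BSON Binary. A minimal sketch of reading one document back and reversing that pipeline (the local connection details are placeholders, not the commit's server):

```python
import pickle
import zlib

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)  # placeholder address
webpages = client.msohu.webpages

doc = webpages.find_one()  # any previously stored document
if doc:
    # Reverse the storage pipeline: Binary bytes -> zlib.decompress -> pickle.loads
    html_page = pickle.loads(zlib.decompress(doc['page']))
    print(doc['url'], len(html_page))
```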
玩转PyCharm(上).md
```diff
@@ -6,13 +6,15 @@ PyCharm, developed by JetBrains for professional Python developers, is a
 You can find the [download link](https://www.jetbrains.com/pycharm/download/) for PyCharm on the [official JetBrains website](). There are two downloadable editions: the Community edition, released under the [Apache License](https://zh.wikipedia.org/wiki/Apache%E8%AE%B8%E5%8F%AF%E8%AF%81), and the Professional edition, released under a proprietary license (it has to be purchased and can be trialed for 30 days after download) and offering many additional features. Installing PyCharm requires a JRE (Java Runtime Environment); if you do not have one, you can choose to download and install it online during installation.
 > Note: If you are a student and would like to buy PyCharm, have a look at the [official guide to applying for the education discount](https://sales.jetbrains.com/hc/zh-cn/articles/207154369).
 ### First-time setup
 The first time you run PyCharm, an import-settings wizard appears. If you have never used PyCharm before or have no saved settings, simply choose "Do not import settings" and continue to the next step.
 ![](./res/pycharm-import-settings.png)
-The Professional edition of PyCharm has to be activated. We strongly recommend paying for good software; if you are not using it for commercial purposes, you can choose the 30-day trial for now or use the Community edition of PyCharm.
+The Professional edition of PyCharm has to be activated. **We strongly recommend paying for good software**; if you are not using it for commercial purposes, you can choose the 30-day trial for now or use the Community edition of PyCharm.
 ![](./res/pycharm-activate.png)
```