-
Notifications
You must be signed in to change notification settings - Fork 12
/
findall.py
36 lines (29 loc) · 1.44 KB
/
findall.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
'''
Description: re.findall
Author: HCQ
Company(School): UCAS
Email: 1756260160@qq.com
Date: 2021-01-17 19:50:56
LastEditTime: 2021-01-17 20:58:49
FilePath: /Spider/package/4re正则表达式/re基础/findall.py
'''
# 下面代码块来自于: package/4re正则表达式/01爬取图片正则解析.py
import requests
import re
# =====得到page_text 方式1========================
# f = open("index.html","r",encoding='utf-8')
# page_text = f.read()
# # page_text = requests.get(url=url,headers=headers).text
# print(page_text)
# =====得到page_text 方式2========================
page_text = '''
<a class="recmd-left video" href="/article/123936592" rel="nofollow" target="_blank" onclick="_hmt.push(['_trackEvent','web-list-video','chick'])">
<img src="//qiubai-video-web.qiushibaike.com/ZHVL04ZEOPBIUZ9X_hd.jpg?imageView2/1/w/150/h/112" alt="因为一件事让我在学校">
<div class="recmd-tag">2:11</div>
</a>
'''
#使用聚焦爬虫将页面中所有的糗图进行解析/提取(想要的是括号里面的(.*?))============================================================
ex = '<img src="(.*?)" alt.*?'
img_src_list = re.findall(ex,page_text,re.S) # 想要的是括号里面的(.*?)============================================
print(img_src_list) # ['//qiubai-video-web.qiushibaike.com/ZHVL04ZEOPBIUZ9X_hd.jpg?imageView2/1/w/150/h/112']
print(img_src_list[0]) # //qiubai-video-web.qiushibaike.com/ZHVL04ZEOPBIUZ9X_hd.jpg?imageView2/1/w/150/h/112