Crawler #11
Changes from 25 commits
@@ -0,0 +1,90 @@ (new file: the base crawler class, imported below as `tryy.base_class`)

```python
from abc import ABC
from abc import abstractmethod
import hashlib

from bs4 import BeautifulSoup
import requests


class BaseCrawler(ABC):

    def __init__(self,
                 title,
                 content,
                 category,
                 modified_date,
                 media,
                 url=None,
                 url_hash=None,
                 content_hash=None,
                 headers=None):
        self.title = title
        self.content = content
        self.category = category
        self.modified_date = self.get_modified_date(modified_date)
        self.media = media
        self.url = self.get_page(url, headers)
        self.url_hash = url_hash if url_hash else self.generate_hash(
            url) if url else None
        self.content_hash = content_hash if content_hash else self.generate_hash(
            content) if content else None
        self.url_hash = url_hash if url_hash else self.generate_hash(
            url) if url else None
        self.content_hash = content_hash if content_hash else self.generate_hash(
            content) if content else None
```

Review comment: Maybe you accidentally duplicated this?
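A minimal sketch of the single-assignment version the comment is pointing at (same truthiness semantics as the original chain):

```python
# Inside __init__: one assignment each; `or` keeps the provided hash, else derives it.
self.url_hash = url_hash or (self.generate_hash(url) if url else None)
self.content_hash = content_hash or (self.generate_hash(content) if content else None)
```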
```python
    @abstractmethod
```

Review comment: Does the use of `@abstractmethod` make sense here?
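Worth noting in this context: with `@abstractmethod` on `get_page`, `BaseCrawler` cannot be instantiated directly, yet the crawlers below do exactly that. A minimal sketch of the failure mode:

```python
from abc import ABC, abstractmethod


class BaseCrawler(ABC):
    @abstractmethod
    def get_page(self, url, headers):
        ...


BaseCrawler()  # TypeError: Can't instantiate abstract class BaseCrawler ...
```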
```python
    def get_page(self, url, headers):
        try:
            r = requests.get(url, headers)
            r.encoding = 'UTF-8'
            soup = BeautifulSoup(r.text, "html.parser")
            if soup is None:
                print("Soup object is None. Parsing failed.")
            return soup
        except requests.RequestException as e:
            print(f"Error fetching page")
```

Review comment: You can print out the error, and don't use f-strings if the string is hard-coded.

```python
            return None
```

Review comment: Python implicitly returns `None` for us, so there's no need to explicitly return it.
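A sketch of `get_page` folding in both comments. It also passes `headers` as a keyword argument, since the second positional parameter of `requests.get` is `params`, not `headers`, and adds a `timeout` as a defensive assumption of mine:

```python
def get_page(self, url, headers):
    try:
        # headers must be passed by keyword; positionally it would become `params`.
        r = requests.get(url, headers=headers, timeout=10)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, "html.parser")
    except requests.RequestException as e:
        # Plain string plus the error object; no f-string needed for a fixed message.
        print("Error fetching page:", e)
        # Falling off the end returns None implicitly.
```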
```python
    def get_title(self, soup, title_sel):
        title = soup.select_one(title_sel)
        title = title.text
        return title

    def get_content(self, soup, content_sel, title):
        content_sel = soup.select(content_sel)
        article_content = []
        content_str = ""
        content_str += title
```

Review comment: What is `content_str` for? It is built up but never used afterwards.

```python
        for s in content_sel:
            s = s.text.replace('\n', '')
            article_content.append(s)
        return article_content
```

Review comment: It would be better to use a list comprehension.
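A sketch of `get_content` with the comprehension, dropping the unused `content_str` (this assumes the `title` parameter really is dead weight here; if the intent was to prepend the title, that would be a one-line change):

```python
def get_content(self, soup, content_sel, title):
    # Select once, strip newlines from each matched node's text.
    return [s.text.replace('\n', '') for s in soup.select(content_sel)]
```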
```python
    def get_category(self, soup, category_sel):
        category = soup.select(category_sel)
        return category

    def find_category(self, soup, type, class_):
        category = soup.find_all(type, class_=class_)
        for c in category:
            print(c.text(), " ")
```

Review comment: Please remove this, as it seems like a simple test that we won't need in a production scenario.

```python
        return category
```
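For reference, the trimmed method once the comment is applied. Renaming the `type` parameter is my addition, since it shadows the builtin; note also that the removed line called `c.text()`, which would raise, because `.text` is a property, not a method:

```python
def find_category(self, soup, tag_name, class_):
    # Debug print removed; `tag_name` avoids shadowing the builtin `type`.
    return soup.find_all(tag_name, class_=class_)
```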
```python
    def get_modified_date(self, date_text):
        try:
            date_text = date_text.strip()
            if ":" in date_text and len(date_text.split(":")) == 3:
                date_text = ':'.join(date_text.split(':')[:-1])
            if '-' in date_text:
                date_text = date_text.replace('-', '/')
            if ' ' not in date_text:
                date_text += " 00:00"
            return date_text[:16]
        except Exception as e:
            print(f"Error getting modified date {e}")
            return None
```

Review comment: Python implicitly returns `None` for us, so there's no need to explicitly return it.
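To make the normalization concrete, here are two hypothetical inputs traced through the branches (examples of mine, not from the repo):

```python
# "2024-01-02 15:04:05" -> three ':'-separated parts, seconds dropped -> "2024-01-02 15:04"
#                       -> dashes become slashes                      -> "2024/01/02 15:04"
# "2024-01-02"          -> no space, so " 00:00" is appended          -> "2024/01/02 00:00"
# Both results are exactly 16 characters, so the final [:16] slice is a no-op here.
```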
```python
    def generate_hash(self, data):
        result = hashlib.sha1(data.encode('utf-8'))
        return result.hexdigest()
```
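A quick check of what `generate_hash` produces; any string works, the URL below is just one that appears in this PR:

```python
import hashlib

url = 'https://www.cna.com.tw/list/aall.aspx'
print(hashlib.sha1(url.encode('utf-8')).hexdigest())  # 40-char hex digest -> url_hash
```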
@@ -0,0 +1,81 @@ (new file: the CNA crawler)

```python
# -*- coding: utf-8 -*-

from proto.news_pb2 import News
from tryy.base_class import BaseCrawler


def cna_crawler(size=30):

    media = '中央社'
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
```

Review comment: This should be a member of the base class, because all the crawlers use the same headers.
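A sketch of the hoisting the comment asks for; the attribute name `DEFAULT_HEADERS` and the trimmed `__init__` are assumptions of mine, not code from this PR:

```python
from abc import ABC


class BaseCrawler(ABC):
    # Shared by every crawler; pass `headers` explicitly only when a site needs more.
    DEFAULT_HEADERS = {
        "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/119.0.0.0 Safari/537.36")
    }

    def __init__(self, headers=None):
        self.headers = headers or self.DEFAULT_HEADERS
```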
```python
    article_list = []

    temp_base = BaseCrawler(title=None,
                            content=None,
                            category=None,
                            modified_date=None,
                            media=None)
    soup = temp_base.get_page('https://www.cna.com.tw/list/aall.aspx', headers)
```

Review comment: Check whether `soup` is `None` before using it.

Author reply: I added it to the base_class like this: …

Review reply: FYI, you can make good use of code blocks to improve the readability.
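A guard along the lines being discussed might look like this (a sketch of the idea, not the author's actual commit):

```python
soup = temp_base.get_page('https://www.cna.com.tw/list/aall.aspx', headers)
if soup is None:
    # get_page returns None on request errors; stop instead of crashing on soup.find below.
    return
```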
```python
    sel = soup.find('ul', 'mainList imgModule', id='jsMainList').find_all('li')

    urls = [s.find('a')['href'] for s in sel if s.find('a')]

    news_count = 0
    for url in urls:
        instance = BaseCrawler("Title",
                               "Content",
                               "Category",
                               "Modified_Date",
                               "中央社",
                               url=url,
                               headers=headers)
        try:
            soup = instance.url

            title_selector = 'div.centralContent h1'
            title = instance.get_title(soup, title_selector)

            modified_date = soup.find('div', class_='updatetime').text
            modified_date = instance.get_modified_date(modified_date)

            category_selector = 'div.breadcrumb a'
            category = instance.get_category(soup, category_selector)
            category = category[1].text

            tag_links = soup.select('div.keywordTag a')
            tags = [tag.get_text().replace('#', '') for tag in tag_links]

            content = soup.find("div", class_="paragraph")
            content_selector = 'p:lang(zh)'
            content = instance.get_content(content, content_selector, title)

            news_item = News()
            news_item.title = title
            news_item.content = content
            news_item.category = category
            news_item.modified_date = modified_date
            news_item.media = media
            news_item.tags.extend(tags)
            news_item.url = url
            news_item.url_hash = instance.url_hash
            news_item.content_hash = instance.content_hash

            yield news_item
            news_count += 1

            if news_count >= size:
                break

        except Exception as e:
            print("中央社 cna")
            print(url)
            print(e)
```
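A minimal driver for the generator (hypothetical usage; assumes `proto.news_pb2` is importable):

```python
for item in cna_crawler(size=3):
    print(item.title, item.url)
```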
@@ -0,0 +1,98 @@ (new file: the CTS crawler)

```python
# -*- coding: utf-8 -*-

from tryy.base_class import BaseCrawler


def cts_crawler(size=30):

    media = "華視"
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }

    article_list = []

    temp_base = BaseCrawler(title=None,
                            content=None,
                            category=None,
                            modified_date=None,
                            media=None)
    soup = temp_base.get_page('https://news.cts.com.tw/real/index.html',
                              headers)
    sel = soup.select('div.newslist-container')
    sel = sel[0].find_all('a', href=True)

    urls = []
    for s in sel:
        urls.append(s['href'])
    # print(urls)

    news_count = 0
    for url in urls:
        instance = BaseCrawler("Title",
                               "Content",
                               "Category",
                               "Modified_Date",
                               "華視",
                               url=url,
                               headers=headers)
        try:
            # print(url)
            soup = instance.url
            # print("success__getting__page")

            title_selector = 'h1.artical-title'
            title = instance.get_title(instance.url, title_selector)
            # print("success__getting__title: ", title)

            modified_date = soup.select("time.artical-time")
            modified_date = modified_date[0].text
            modified_date = instance.get_modified_date(modified_date)
            # print("success__getting__modified_date: ", modified_date)

            category_selector = 'div.item.menu-active'
            category = instance.get_category(soup, category_selector)[-1].text
            # print("success__getting__category: ", category)

            tags = []
            tags_span = soup.select("div.news-tag")[0].find_all('a')
            for tag in tags_span:
                tags.append(tag.text)
            # print("success__getting__tags: ", tags)

            content_selector = 'div.artical-content'
            content = instance.get_content(soup, content_selector, title)
            # print("success__getting__content: ", content)

            # print("success__getting__url_hash: ", instance.url_hash)
            # print("success__getting__cont_hash: ", instance.content_hash, "\n\n")

            news_dict = {}
            news_dict['title'] = title
            news_dict['content'] = content
            news_dict['category'] = category
            news_dict['modified_date'] = modified_date
            news_dict['media'] = media
            news_dict['tags'] = tags
            news_dict['url'] = url
            news_dict['url_hash'] = instance.url_hash
            news_dict['content_hash'] = instance.content_hash

            # print(news_dict)
            article_list.append(news_dict)

            news_count += 1
            if news_count >= size:
                break

        except Exception as e:
            print("華視 cts")
            print(url)
            print(e)
            continue

    return article_list


# result = cts_crawler(size=1)
```
@@ -0,0 +1,112 @@ (new file: the SETN crawler)

```python
# -*- coding: utf-8 -*-

from tryy.base_class import BaseCrawler


def setn_crawler(size=10):

    # print("in setn")

    media = '三立'
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }

    article_list = []

    view_all_link = "https://www.setn.com/ViewAll.aspx"
    base = 'https://www.setn.com'

    temp_base = BaseCrawler(title=None,
                            content=None,
                            category=None,
                            modified_date=None,
                            media=None)

    soup = temp_base.get_page(view_all_link, headers)
    sel = soup.find_all('a', class_='gt')

    urls = []
    for s in sel:
        u = s.get('href')
        if u[0] == '/':
            full_url = base + u
        else:
            full_url = u
        urls.append(full_url)

    article_count = 0
    for url in urls:
        instance = BaseCrawler("Title",
                               "Content",
                               "Category",
                               "Modified_Date",
                               "三立",
                               url=url,
                               headers=headers)
        try:
            # print(url)
            soup = instance.url
            # print("success__getting__page")

            title_selector = 'h1.news-title-3'
            title = instance.get_title(soup, title_selector)
            # print("success__getting__title: ", title)

            try:
                modified_date = soup.select_one('div.page-title-text')
                modified_date = modified_date.text
            except AttributeError:
                modified_date = soup.find(class_='newsTime').time.text
            modified_date = instance.get_modified_date(modified_date)
            # print("success__getting__modified_date: ", modified_date)

            category_selector = 'meta[property="article:section"]'
            category = instance.get_category(soup,
                                             category_selector)[0]['content']
            # print("success__getting__category: ", category)

            tags = soup.find('meta', attrs={'name': 'news_keywords'
                                            })['content'].split(',')
            # print("success__getting__tags: ", tags)

            content_selector = 'article p'
            content = instance.get_content(soup, content_selector, title)
            # print("success__getting__content: ", content)

            # print("success__getting__url_hash: ", instance.url_hash)
            # print("success__getting__cont_hash: ", instance.content_hash, "\n\n")

            news_dict = {
                'title': title,
                'content': content,
                'category': category,
                'modified_date': modified_date,
                'media': media,
                'tags': tags,
                'url': url,
                'url_hash': instance.url_hash,
                'content_hash': instance.content_hash
            }

            # print(news_dict)
            article_list.append(news_dict)

            article_count += 1

            if article_count >= size:
                break

        except Exception as e:
            print("三立 setn")
            print(url)
            print(e)
            continue

    return article_list


# print("import success")
# result = setn_crawler(size=1)
```
Review comment: This section is a little hard to read, use … here.
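Given that the SETN crawler already builds its dictionary as a single literal, the elided suggestion plausibly points the CTS crawler at the same pattern (a sketch, not the reviewer's confirmed wording):

```python
# One literal instead of nine separate assignments.
news_dict = {
    'title': title,
    'content': content,
    'category': category,
    'modified_date': modified_date,
    'media': media,
    'tags': tags,
    'url': url,
    'url_hash': instance.url_hash,
    'content_hash': instance.content_hash,
}
```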