Crawler #11

Closed · wants to merge 29 commits
241 changes: 227 additions & 14 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -15,6 +15,9 @@ chromadb = "^0.4.14"
llama-index = "^0.9.10"
transformers = "^4.35.2"
torch = "^2.1.1"
selenium = "^4.15.2"
beautifulsoup4 = "^4.12.2"
requests = "^2.31.0"


[tool.poetry.group.dev.dependencies]
Empty file added tryy/__init__.py
Empty file.
90 changes: 90 additions & 0 deletions tryy/base_class.py
@@ -0,0 +1,90 @@
from abc import ABC
from abc import abstractmethod
import hashlib

from bs4 import BeautifulSoup
import requests


class BaseCrawler(ABC):

def __init__(self,
title,
content,
category,
modified_date,
media,
url=None,
url_hash=None,
content_hash=None,
headers=None):
self.title = title
self.content = content
self.category = category
self.modified_date = self.get_modified_date(modified_date)
self.media = media
self.url = self.get_page(url, headers)
self.url_hash = url_hash if url_hash else self.generate_hash(
url) if url else None
self.content_hash = content_hash if content_hash else self.generate_hash(
content) if content else None
Collaborator:

This section is a little hard to read; use

if ...:
    ...
elif ...:
    ...
else:
    ...

here.
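
A minimal sketch of that suggestion, reusing the existing generate_hash helper:

# Sketch only: equivalent to the nested conditional expressions above.
if url_hash:
    self.url_hash = url_hash
elif url:
    self.url_hash = self.generate_hash(url)
else:
    self.url_hash = None

if content_hash:
    self.content_hash = content_hash
elif content:
    self.content_hash = self.generate_hash(content)
else:
    self.content_hash = None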

self.url_hash = url_hash if url_hash else self.generate_hash(
url) if url else None
self.content_hash = content_hash if content_hash else self.generate_hash(
content) if content else None
Collaborator:

Maybe you accidentally duplicated this?


@abstractmethod
Collaborator:

Does the use of @abstractmethod match your intention? It requires all derived classes to implement the get_page function.
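
For reference, a minimal sketch (with hypothetical class names, not from this PR) of what @abstractmethod enforces, namely that a subclass without its own get_page cannot be instantiated:

from abc import ABC, abstractmethod

class Crawler(ABC):  # hypothetical example class
    @abstractmethod
    def get_page(self, url, headers):
        ...

class BrokenCrawler(Crawler):  # no get_page override
    pass

BrokenCrawler()  # raises TypeError: can't instantiate an abstract class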

def get_page(self, url, headers):
try:
r = requests.get(url, headers)
r.encoding = 'UTF-8'
soup = BeautifulSoup(r.text, "html.parser")
if soup is None:
print("Soup object is None. Parsing failed.")
return soup
except requests.RequestException as e:
print(f"Error fetching page")
Collaborator:

You can print out the error, and don't use f-strings if the string is hard-coded.
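
A minimal sketch of that suggestion:

except requests.RequestException as e:
    print("Error fetching page:", e)  # plain string plus the exception; no f-string needed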

return None
Collaborator:

Python implicitly returns None for us, so there's no need to explicitly return it.


def get_title(self, soup, title_sel):
title = soup.select_one(title_sel)
title = title.text
return title

def get_content(self, soup, content_sel, title):
content_sel = soup.select(content_sel)
article_content = []
content_str = ""
content_str += title
Collaborator:

What is content_str used for? Remove it if unused.

for s in content_sel:
s = s.text.replace('\n', '')
article_content.append(s)
return article_content
Collaborator:

It would be better to use a list comprehension.
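
A minimal sketch of that suggestion, keeping the current signature (title stays for compatibility even though it is unused here):

def get_content(self, soup, content_sel, title):
    # Same result as the loop above, minus the unused content_str.
    return [s.text.replace('\n', '') for s in soup.select(content_sel)]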


def get_category(self, soup, category_sel):
category = soup.select(category_sel)
return category

def find_category(self, soup, type, class_):
Collaborator:

type is a built-in function; consider renaming it.

category = soup.find_all(type, class_=class_)
for c in category:
print(c.text(), " ")
Collaborator:

Please remove this, as it seems like a simple test that we won't need in a production scenario.

return category
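
A minimal sketch combining the two comments above; the parameter name tag_name is an assumption, not something from the PR:

def find_category(self, soup, tag_name, class_):
    # `tag_name` replaces the shadowed built-in `type`; the debug print is dropped.
    return soup.find_all(tag_name, class_=class_)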

def get_modified_date(self, date_text):
try:
date_text = date_text.strip()
if ":" in date_text and len(date_text.split(":")) == 3:
date_text = ':'.join(date_text.split(':')[:-1])
if '-' in date_text:
date_text = date_text.replace('-', '/')
if ' ' not in date_text:
date_text += " 00:00"
return date_text[:16]
except Exception as e:
print(f"Error getting modified date {e}")
return None
Collaborator @david20571015 commented Jan 8, 2024:

Python implicitly returns None for us, so there's no need to explicitly return it.


def generate_hash(self, data):
result = hashlib.sha1(data.encode('utf-8'))
return result.hexdigest()
81 changes: 81 additions & 0 deletions tryy/cna.py
@@ -0,0 +1,81 @@
# -!- coding: utf-8 -!-

from proto.news_pb2 import News
from tryy.base_class import BaseCrawler


def cna_crawler(size=30):

media = '中央社'
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}
Collaborator:

This should be a member of the base class because all crawlers use the same header.
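
A minimal sketch of that suggestion (the attribute name HEADERS is an assumption, not from the PR):

from abc import ABC

class BaseCrawler(ABC):
    # Shared by every crawler, so it could live on the base class.
    HEADERS = {
        "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }

Each crawler would then reference self.HEADERS instead of redefining the same dict.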


headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

article_list = []

temp_base = BaseCrawler(title=None,
content=None,
category=None,
modified_date=None,
media=None)
soup = temp_base.get_page('https://www.cna.com.tw/list/aall.aspx', headers)
Collaborator:

Check whether soup is None.

Contributor (author):

I added it to the base_class like this:
def get_page(self, url, headers):
try:
r = requests.get(url, headers)
r.encoding = 'UTF-8'
soup = BeautifulSoup(r.text, "html.parser")
if soup is None:
print("Soup object is None. Parsing failed.")
return soup
except requests.RequestException as e:
print(f"Error fetching page")
return None

Collaborator:


FYI, you can make good use of code blocks to improve the readability.
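
And a minimal sketch of the caller-side check the reviewer asked for; returning early is only one possible way to handle the failure:

soup = temp_base.get_page('https://www.cna.com.tw/list/aall.aspx', headers)
if soup is None:
    # Fetch or parse failed; bail out instead of crashing on soup.find below.
    return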

sel = soup.find('ul', 'mainList imgModule', id='jsMainList').find_all('li')

urls = [s.find('a')['href'] for s in sel if s.find('a')]

news_count = 0
for url in urls:
instance = BaseCrawler("Title",
"Content",
"Category",
"Modified_Date",
"中央社",
url=url,
headers=headers)
try:
soup = instance.url

title_selector = 'div.centralContent h1'
title = instance.get_title(soup, title_selector)

modified_date = soup.find('div', class_='updatetime').text
modified_date = instance.get_modified_date(modified_date)

category_selector = 'div.breadcrumb a'
category = instance.get_category(soup, category_selector)
category = category[1].text

tag_links = soup.select('div.keywordTag a')
tags = [tag.get_text().replace('#', '') for tag in tag_links]

content = soup.find("div", class_="paragraph")
content_selector = 'p:lang(zh)'
content = instance.get_content(content, content_selector, title)

news_item = News()
news_item.title = title
news_item.content = content
news_item.category = category
news_item.modified_date = modified_date
news_item.media = media
news_item.tags.extend(tags)
news_item.url = url
news_item.url_hash = instance.url_hash
news_item.content_hash = instance.content_hash

yield news_item
news_count += 1

if news_count >= size:
break

except Exception as e:
print("中央社cna")
print(url)
print(e)
98 changes: 98 additions & 0 deletions tryy/cts.py
@@ -0,0 +1,98 @@
# -!- coding: utf-8 -!-

from tryy.base_class import Base


def cts_crawler(size=30):

media = "華視"
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

article_list = []

temp_base = Base(title=None,
content=None,
category=None,
modified_date=None,
media=None)
soup = temp_base.get_page('https://news.cts.com.tw/real/index.html',
headers)
sel = soup.select('div.newslist-container')
sel = sel[0].find_all('a', href=True)

urls = []
for s in sel:
urls.append(s['href'])
# print(urls)

news_count = 0
for url in urls:
instance = Base("Title",
"Content",
"Category",
"Modified_Date",
"華視",
url=url,
headers=headers)
try:
# print(url)
soup = instance.url
# print("success__getting__page")

title_selector = 'h1.artical-title'
title = instance.get_title(instance.url, title_selector)
# print("success__getting__title: ", title)

modified_date = soup.select("time.artical-time")
modified_date = modified_date[0].text
modified_date = instance.get_modified_date(modified_date)
# print("success__getting__modified_date: ", modified_date)

category_selector = 'div.item.menu-active'
category = instance.get_category(soup, category_selector)[-1].text
# print("success__getting__category: ", category)

tags = []
tags_span = soup.select("div.news-tag")[0].find_all('a')
for tag in tags_span:
tags.append(tag.text)
# print("success__getting__tags: ", tags)

content_selector = 'div.artical-content'
content = instance.get_content(soup, content_selector, title)
# print("success__getting__content: ", content)

# print("success__getting__url_hash: ", instance.url_hash)
# print("success__getting__cont_hash: ", instance.content_hash, "\n\n")

news_dict = {}
news_dict['title'] = title
news_dict['content'] = content
news_dict['category'] = category
news_dict['modified_date'] = modified_date
news_dict['media'] = media
news_dict['tags'] = tags
news_dict['url'] = url
news_dict['url_hash'] = instance.url_hash
news_dict['content_hash'] = instance.content_hash

# print(news_dict)
article_list.append(news_dict)

news_count += 1
if news_count >= size:
break

except Exception as e:
print("華視 cts")
print(url)
print(e)
continue

return article_list


# result = cts_crawler(size=1)
112 changes: 112 additions & 0 deletions tryy/setn.py
@@ -0,0 +1,112 @@
# -!- coding: utf-8 -!-

from tryy.base_class import Base


def setn_crawler(size=10):

# print("in setn")

media = '三立'
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

article_list = []

view_all_link = "https://www.setn.com/ViewAll.aspx"
base = 'https://www.setn.com'

temp_base = Base(title=None,
content=None,
category=None,
modified_date=None,
media=None)

soup = temp_base.get_page(view_all_link, headers)
sel = soup.find_all('a', class_='gt')

urls = []
for s in sel:
u = s.get('href')
if u[0] == '/':
full_url = base + u
else:
full_url = u
urls.append(full_url)

article_count = 0
for url in urls:
instance = Base("Title",
"Content",
"Category",
"Modified_Date",
"三立",
url=url,
headers=headers)
try:
# print(url)
soup = instance.url
# print("success__getting__page")

title_selector = 'h1.news-title-3'
title = instance.get_title(soup, title_selector)
# print("success__getting__title: ", title)

try:
modified_date = soup.select_one('div.page-title-text')
modified_date = modified_date.text
except AttributeError:
modified_date = soup.find(class_='newsTime').time.text
modified_date = instance.get_modified_date(modified_date)
# print("success__getting__modified_date: ", modified_date)

category_selector = 'meta[property="article:section"]'
category = instance.get_category(soup,
category_selector)[0]['content']
# print("success__getting__category: ", category)

tags = []
tags = soup.find('meta', attrs={'name': 'news_keywords'
})['content'].split(',')
# print("success__getting__tags: ", tags)

content_selector = 'article p'
content = instance.get_content(soup, content_selector, title)
# print("success__getting__content: ", content)

# print("success__getting__url_hash: ", instance.url_hash)
# print("success__getting__cont_hash: ", instance.content_hash, "\n\n")

news_dict = {
'title': title,
'content': content,
'category': category,
'modified_date': modified_date,
'media': media,
'tags': tags,
'url': url,
'url_hash': instance.url_hash,
'content_hash': instance.content_hash
}

# print(news_dict)
article_list.append(news_dict)

article_count += 1

if article_count >= size:
break

except Exception as e:
print("三立 setn")
print(url)
print(e)
continue

return article_list


# print("import success")
# result = setn_crawler(size=1)