Crawler #11

Closed · wants to merge 29 commits
241 changes: 227 additions & 14 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -15,6 +15,9 @@ chromadb = "^0.4.14"
llama-index = "^0.9.10"
transformers = "^4.35.2"
torch = "^2.1.1"
selenium = "^4.15.2"
beautifulsoup4 = "^4.12.2"
requests = "^2.31.0"


[tool.poetry.group.dev.dependencies]
Empty file.
86 changes: 86 additions & 0 deletions sync_crawler/crawler/base_crawler.py
@@ -0,0 +1,86 @@
from abc import ABC
import hashlib

from bs4 import BeautifulSoup
import requests


class BaseCrawler(ABC):

    def __init__(self,
                 title,
                 content,
                 category,
                 modified_date,
                 media,
                 url=None,
                 url_hash=None,
                 content_hash=None,
                 headers=None):
        self.title = title
        self.content = content
        self.category = category
        self.modified_date = self.get_modified_date(modified_date)
        self.media = media
        self.headers = headers
        self.url = url
        # Fall back to derived hashes so callers may pass either raw values
        # or precomputed digests.
        self.url_hash = url_hash or (self.generate_hash(url) if url else None)
        self.content_hash = content_hash or (self.generate_hash(content)
                                             if content else None)

    def get_page(self, url, timeout):
        try:
            r = requests.get(url, headers=self.headers, timeout=timeout)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, "html.parser")
        except requests.RequestException as e:
            print(f"Error fetching page: {e}")
            return None

    def get_title(self, soup, title_sel):
        title = soup.select_one(title_sel)
        return title.text if title else None

    def get_content(self, soup, content_sel, title):
        # The title leads the article body, one paragraph per list entry.
        article_content = [title]
        for paragraph in soup.select(content_sel):
            article_content.append(paragraph.text.replace('\n', ''))
        return article_content

    def get_category(self, soup, category_sel):
        return soup.select(category_sel)

    def find_category(self, soup, type_, class_):
        category = soup.find_all(type_, class_=class_)
        for c in category:
            print(c.text, " ")
        return category

    def get_modified_date(self, date_text):
        """Normalize assorted site date strings to 'YYYY/MM/DD HH:MM'."""
        try:
            date_text = date_text.strip()
            # Drop trailing seconds from 'HH:MM:SS'.
            if ":" in date_text and len(date_text.split(":")) == 3:
                date_text = ':'.join(date_text.split(':')[:-1])
            if '-' in date_text:
                date_text = date_text.replace('-', '/')
            # Date-only strings get a midnight timestamp.
            if ' ' not in date_text:
                date_text += " 00:00"
            return date_text[:16]
        except Exception as e:
            print(f"Error getting modified date: {e}")
            return None

    def generate_hash(self, data):
        # SHA-1 here is a dedup key, not a security measure.
        return hashlib.sha1(data.encode('utf-8')).hexdigest()
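
The date-normalization and hashing helpers are pure functions, so they can be sanity-checked without touching the network. A minimal sketch with placeholder values (BaseCrawler declares no abstract methods, so it instantiates directly):

from sync_crawler.crawler.base_crawler import BaseCrawler

crawler = BaseCrawler(
    title="demo",
    content="demo body",
    category="news",
    modified_date="2023-11-28 10:30:45",
    media="demo",
)

print(crawler.modified_date)                    # 2023/11/28 10:30 (seconds dropped, '-' -> '/')
print(crawler.get_modified_date("2023-11-28"))  # 2023/11/28 00:00 (midnight appended)
print(crawler.content_hash)                     # 40-char SHA-1 hex digest of "demo body"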
36 changes: 36 additions & 0 deletions sync_crawler/crawler/cna_crawler.py
@@ -0,0 +1,36 @@
from sync_crawler.crawler.base_crawler import BaseCrawler

url = 'https://www.cna.com.tw/list/aall.aspx'


class CnaCrawler(BaseCrawler):

    def cna_urls(self, timeout: int):
        soup = self.get_page(url, timeout)
        sel = soup.find('ul', 'mainList imgModule',
                        id='jsMainList').find_all('li')
        return [s.find('a')['href'] for s in sel if s.find('a')]

    def cna_modified_date(self, soup):
        modified_date = soup.find('div', class_='updatetime').text
        return self.get_modified_date(modified_date)

    def cna_category(self, soup):
        # The second breadcrumb entry holds the section name.
        category = self.get_category(soup, 'div.breadcrumb a')
        return category[1].text

    def cna_tag(self, soup):
        tag_links = soup.select('div.keywordTag a')
        return [tag.get_text().replace('#', '') for tag in tag_links]

    def cna_content(self, soup, title: str):
        # Narrow to the article body before selecting paragraphs.
        body = soup.find("div", class_="paragraph")
        return self.get_content(body, 'p:lang(zh)', title)
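
A hypothetical driver for CnaCrawler, assuming network access and that the selectors above still match the live CNA markup; the constructor arguments and the 'h1' title selector are placeholders:

crawler = CnaCrawler(
    title=None, content=None, category=None,
    modified_date="", media="cna",
    headers={'User-Agent': 'Mozilla/5.0'},
)

urls = crawler.cna_urls(timeout=10)
soup = crawler.get_page(urls[0], timeout=10)
if soup is not None:
    title = crawler.get_title(soup, 'h1')  # placeholder selector
    print(crawler.cna_modified_date(soup))
    print(crawler.cna_category(soup))
    print(crawler.cna_tag(soup))
    print(crawler.cna_content(soup, title)[:2])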
39 changes: 39 additions & 0 deletions sync_crawler/crawler/cts_crawler.py
@@ -0,0 +1,39 @@
from sync_crawler.crawler.base_crawler import BaseCrawler

# Assumed URL for the CTS real-time news list; verify against the live site.
url = 'https://news.cts.com.tw/real/index.html'


class CtsCrawler(BaseCrawler):

    def cts_urls(self, timeout: int):
        soup = self.get_page(url, timeout)
        links = soup.select('div.newslist-container')[0].find_all('a', href=True)
        return [link['href'] for link in links]

    def cts_modified_date(self, soup):
        modified_date = soup.select("time.artical-time")[0].text
        return self.get_modified_date(modified_date)

    def cts_category(self, soup):
        category_selector = 'div.item.menu-active'
        return self.get_category(soup, category_selector)[-1].text

    def cts_tag(self, soup):
        tags_span = soup.select("div.news-tag")[0].find_all('a')
        return [tag.text for tag in tags_span]

    def cts_content(self, soup, title: str):
        content_selector = 'div.artical-content'
        return self.get_content(soup, content_selector, title)
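
The same pattern works as a smoke test for CtsCrawler, under the same assumptions (live-site markup, network access, placeholder constructor values):

crawler = CtsCrawler(
    title=None, content=None, category=None,
    modified_date="", media="cts",
    headers={'User-Agent': 'Mozilla/5.0'},
)

for link in crawler.cts_urls(timeout=10)[:3]:
    soup = crawler.get_page(link, timeout=10)
    if soup is None:
        continue
    print(crawler.cts_modified_date(soup), crawler.cts_category(soup))
    print(crawler.cts_tag(soup))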