Crawler #11

Closed · wants to merge 29 commits
241 changes: 227 additions & 14 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -15,6 +15,9 @@ chromadb = "^0.4.14"
llama-index = "^0.9.10"
transformers = "^4.35.2"
torch = "^2.1.1"
selenium = "^4.15.2"
beautifulsoup4 = "^4.12.2"
requests = "^2.31.0"


[tool.poetry.group.dev.dependencies]
Empty file added tryy/__init__.py
Empty file.
90 changes: 90 additions & 0 deletions tryy/base_class.py
@@ -0,0 +1,90 @@
from abc import ABC
from abc import abstractmethod
import hashlib

from bs4 import BeautifulSoup
import requests


class BaseCrawler(ABC):

def __init__(self,
title,
content,
category,
modified_date,
media,
url=None,
url_hash=None,
content_hash=None,
headers=None):
self.title = title
self.content = content
self.category = category
self.modified_date = self.get_modified_date(modified_date)
self.media = media
self.url = self.get_page(url, headers)
self.url_hash = url_hash if url_hash else self.generate_hash(
url) if url else None
self.content_hash = content_hash if content_hash else self.generate_hash(
content) if content else None
Collaborator:

This section is a little hard to read; use

if ...:
    ...
elif ...:
    ...
else:
    ...

here.
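
A minimal sketch of that suggestion, reusing the existing generate_hash helper:

# Sketch only: equivalent to the nested conditional expressions above.
if url_hash:
    self.url_hash = url_hash
elif url:
    self.url_hash = self.generate_hash(url)
else:
    self.url_hash = None

if content_hash:
    self.content_hash = content_hash
elif content:
    self.content_hash = self.generate_hash(content)
else:
    self.content_hash = None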

self.url_hash = url_hash if url_hash else self.generate_hash(
url) if url else None
self.content_hash = content_hash if content_hash else self.generate_hash(
content) if content else None
Collaborator:

Maybe you accidentally duplicated this?


@abstractmethod
Collaborator:

Does the use of @abstractmethod match your intention? It requires all derived classes to implement the get_page function.
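
For reference, a minimal sketch (with hypothetical class names, not from this PR) of what @abstractmethod enforces, namely that a subclass without its own get_page cannot be instantiated:

from abc import ABC, abstractmethod

class Crawler(ABC):  # hypothetical example class
    @abstractmethod
    def get_page(self, url, headers):
        ...

class BrokenCrawler(Crawler):  # no get_page override
    pass

BrokenCrawler()  # raises TypeError: can't instantiate an abstract class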

def get_page(self, url, headers):
try:
r = requests.get(url, headers)
r.encoding = 'UTF-8'
soup = BeautifulSoup(r.text, "html.parser")
if soup is None:
print("Soup object is None. Parsing failed.")
return soup
except requests.RequestException as e:
print(f"Error fetching page")
Collaborator:

You can print out the error, and don't use f-strings if the string is hard-coded.
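
A minimal sketch of that suggestion:

except requests.RequestException as e:
    print("Error fetching page:", e)  # plain string plus the exception; no f-string needed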

return None
Collaborator:

Python implicitly returns None for us, so there's no need to explicitly return it.


def get_title(self, soup, title_sel):
title = soup.select_one(title_sel)
title = title.text
return title

def get_content(self, soup, content_sel, title):
content_sel = soup.select(content_sel)
article_content = []
content_str = ""
content_str += title
Collaborator:

What is content_str used for? Remove it if unused.

for s in content_sel:
s = s.text.replace('\n', '')
article_content.append(s)
return article_content
Collaborator:

It would be better to use a list comprehension.
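
A minimal sketch of that suggestion, keeping the current signature (title stays for compatibility even though it is unused here):

def get_content(self, soup, content_sel, title):
    # Same result as the loop above, minus the unused content_str.
    return [s.text.replace('\n', '') for s in soup.select(content_sel)]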


def get_category(self, soup, category_sel):
category = soup.select(category_sel)
return category

def find_category(self, soup, type, class_):
Collaborator:

type is a built-in function; consider renaming it.

category = soup.find_all(type, class_=class_)
for c in category:
print(c.text(), " ")
Collaborator:

Please remove this, as it seems like a simple test that we won't need in a production scenario.

return category
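
A minimal sketch combining the two comments above; the parameter name tag_name is an assumption, not something from the PR:

def find_category(self, soup, tag_name, class_):
    # `tag_name` replaces the shadowed built-in `type`; the debug print is dropped.
    return soup.find_all(tag_name, class_=class_)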

def get_modified_date(self, date_text):
try:
date_text = date_text.strip()
if ":" in date_text and len(date_text.split(":")) == 3:
date_text = ':'.join(date_text.split(':')[:-1])
if '-' in date_text:
date_text = date_text.replace('-', '/')
if ' ' not in date_text:
date_text += " 00:00"
return date_text[:16]
except Exception as e:
print(f"Error getting modified date {e}")
return None
Collaborator @david20571015 commented Jan 8, 2024:

Python implicitly returns None for us, so there's no need to explicitly return it.


def generate_hash(self, data):
result = hashlib.sha1(data.encode('utf-8'))
return result.hexdigest()
81 changes: 81 additions & 0 deletions tryy/cna.py
@@ -0,0 +1,81 @@
# -!- coding: utf-8 -!-

from proto.news_pb2 import News
from tryy.base_class import BaseCrawler


def cna_crawler(size=30):

media = '中央社'
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}
Collaborator:

This should be a member of the base class because all crawlers use the same header.
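
A minimal sketch of that suggestion (the attribute name HEADERS is an assumption, not from the PR):

from abc import ABC

class BaseCrawler(ABC):
    # Shared by every crawler, so it could live on the base class.
    HEADERS = {
        "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }

Each crawler would then reference self.HEADERS instead of redefining the same dict.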


headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

article_list = []

temp_base = BaseCrawler(title=None,
content=None,
category=None,
modified_date=None,
media=None)
soup = temp_base.get_page('https://www.cna.com.tw/list/aall.aspx', headers)
Collaborator:

Check whether soup is None.

Contributor (author):

I added it to the base_class like this:
def get_page(self, url, headers):
try:
r = requests.get(url, headers)
r.encoding = 'UTF-8'
soup = BeautifulSoup(r.text, "html.parser")
if soup is None:
print("Soup object is None. Parsing failed.")
return soup
except requests.RequestException as e:
print(f"Error fetching page")
return None

Collaborator:


FYI, you can make good use of code blocks to improve the readability.
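
And a minimal sketch of the caller-side check the reviewer asked for; returning early is only one possible way to handle the failure:

soup = temp_base.get_page('https://www.cna.com.tw/list/aall.aspx', headers)
if soup is None:
    # Fetch or parse failed; bail out instead of crashing on soup.find below.
    return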

sel = soup.find('ul', 'mainList imgModule', id='jsMainList').find_all('li')

urls = [s.find('a')['href'] for s in sel if s.find('a')]

news_count = 0
for url in urls:
instance = BaseCrawler("Title",
"Content",
"Category",
"Modified_Date",
"中央社",
url=url,
headers=headers)
try:
soup = instance.url

title_selector = 'div.centralContent h1'
title = instance.get_title(soup, title_selector)

modified_date = soup.find('div', class_='updatetime').text
modified_date = instance.get_modified_date(modified_date)

category_selector = 'div.breadcrumb a'
category = instance.get_category(soup, category_selector)
category = category[1].text

tag_links = soup.select('div.keywordTag a')
tags = [tag.get_text().replace('#', '') for tag in tag_links]

content = soup.find("div", class_="paragraph")
content_selector = 'p:lang(zh)'
content = instance.get_content(content, content_selector, title)

news_item = News()
news_item.title = title
news_item.content = content
news_item.category = category
news_item.modified_date = modified_date
news_item.media = media
news_item.tags.extend(tags)
news_item.url = url
news_item.url_hash = instance.url_hash
news_item.content_hash = instance.content_hash

yield news_item
news_count += 1

if news_count >= size:
break

except Exception as e:
print("中央社cna")
print(url)
print(e)
98 changes: 98 additions & 0 deletions tryy/cts.py
@@ -0,0 +1,98 @@
# -!- coding: utf-8 -!-

from tryy.base_class import Base


def cts_crawler(size=30):

media = "華視"
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

article_list = []

temp_base = Base(title=None,
content=None,
category=None,
modified_date=None,
media=None)
soup = temp_base.get_page('https://news.cts.com.tw/real/index.html',
headers)
sel = soup.select('div.newslist-container')
sel = sel[0].find_all('a', href=True)

urls = []
for s in sel:
urls.append(s['href'])
# print(urls)

news_count = 0
for url in urls:
instance = Base("Title",
"Content",
"Category",
"Modified_Date",
"華視",
url=url,
headers=headers)
try:
# print(url)
soup = instance.url
# print("success__getting__page")

title_selector = 'h1.artical-title'
title = instance.get_title(instance.url, title_selector)
# print("success__getting__title: ", title)

modified_date = soup.select("time.artical-time")
modified_date = modified_date[0].text
modified_date = instance.get_modified_date(modified_date)
# print("success__getting__modified_date: ", modified_date)

category_selector = 'div.item.menu-active'
category = instance.get_category(soup, category_selector)[-1].text
# print("success__getting__category: ", category)

tags = []
tags_span = soup.select("div.news-tag")[0].find_all('a')
for tag in tags_span:
tags.append(tag.text)
# print("success__getting__tags: ", tags)

content_selector = 'div.artical-content'
content = instance.get_content(soup, content_selector, title)
# print("success__getting__content: ", content)

# print("success__getting__url_hash: ", instance.url_hash)
# print("success__getting__cont_hash: ", instance.content_hash, "\n\n")

news_dict = {}
news_dict['title'] = title
news_dict['content'] = content
news_dict['category'] = category
news_dict['modified_date'] = modified_date
news_dict['media'] = media
news_dict['tags'] = tags
news_dict['url'] = url
news_dict['url_hash'] = instance.url_hash
news_dict['content_hash'] = instance.content_hash

# print(news_dict)
article_list.append(news_dict)

news_count += 1
if news_count >= size:
break

except Exception as e:
print("華視 cts")
print(url)
print(e)
continue

return article_list


# result = cts_crawler(size=1)
112 changes: 112 additions & 0 deletions tryy/setn.py
@@ -0,0 +1,112 @@
# -!- coding: utf-8 -!-

from tryy.base_class import Base


def setn_crawler(size=10):

# print("in setn")

media = '三立'
headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

article_list = []

view_all_link = "https://www.setn.com/ViewAll.aspx"
base = 'https://www.setn.com'

temp_base = Base(title=None,
content=None,
category=None,
modified_date=None,
media=None)

soup = temp_base.get_page(view_all_link, headers)
sel = soup.find_all('a', class_='gt')

urls = []
for s in sel:
u = s.get('href')
if u[0] == '/':
full_url = base + u
else:
full_url = u
urls.append(full_url)

article_count = 0
for url in urls:
instance = Base("Title",
"Content",
"Category",
"Modified_Date",
"三立",
url=url,
headers=headers)
try:
# print(url)
soup = instance.url
# print("success__getting__page")

title_selector = 'h1.news-title-3'
title = instance.get_title(soup, title_selector)
# print("success__getting__title: ", title)

try:
modified_date = soup.select_one('div.page-title-text')
modified_date = modified_date.text
except AttributeError:
modified_date = soup.find(class_='newsTime').time.text
modified_date = instance.get_modified_date(modified_date)
# print("success__getting__modified_date: ", modified_date)

category_selector = 'meta[property="article:section"]'
category = instance.get_category(soup,
category_selector)[0]['content']
# print("success__getting__category: ", category)

tags = []
tags = soup.find('meta', attrs={'name': 'news_keywords'
})['content'].split(',')
# print("success__getting__tags: ", tags)

content_selector = 'article p'
content = instance.get_content(soup, content_selector, title)
# print("success__getting__content: ", content)

# print("success__getting__url_hash: ", instance.url_hash)
# print("success__getting__cont_hash: ", instance.content_hash, "\n\n")

news_dict = {
'title': title,
'content': content,
'category': category,
'modified_date': modified_date,
'media': media,
'tags': tags,
'url': url,
'url_hash': instance.url_hash,
'content_hash': instance.content_hash
}

# print(news_dict)
article_list.append(news_dict)

article_count += 1

if article_count >= size:
break

except Exception as e:
print("三立 setn")
print(url)
print(e)
continue

return article_list


# print("import success")
# result = setn_crawler(size=1)