crawling.py
# Returns crawling results: Google search -> article extraction -> keyword/sentence summarization
from flask import jsonify
from flask_restx import Resource, Namespace

from bs4 import BeautifulSoup
from urllib.parse import quote_plus
from selenium import webdriver

# todo: install krwordrank
from krwordrank.sentence import summarize_with_sentences
from krwordrank.word import KRWordRank
from newspaper import Article
from konlpy.tag import Kkma
from konlpy.tag import Twitter

from data_list import get_stopwords
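
# Assumed third-party dependencies (inferred from the imports above; versions not pinned here):
# flask, flask-restx, selenium, beautifulsoup4, lxml (used as the BeautifulSoup parser),
# krwordrank, newspaper3k, konlpy. data_list is a local project module providing
# get_stopwords(), and a chromedriver binary is required by selenium (path set below).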
# Defines this page's namespace as "Crawling".
Crawling = Namespace('Crawling')


# Executed when a request comes in at 3.35.255.25/crawling/<keyword>/<size>
@Crawling.route('/<string:keyword>/<int:size>')
class Search(Resource):
    def get(self, keyword, size):
        # Crawling: build search queries and collect result links.
        keywords = []  # list of search terms
        keywords.append(keyword)
        # Exclude video/shopping sites from the Google results.
        search_filtering = ' -유튜브, -아프리카티비, -지마켓, -옥션, -쇼핑몰, -TV'
        # Query suffixes: '', ' 사전' (dictionary), ' 효능' (efficacy), ' 부작용' (side effects).
        search_words = ['', ' 사전', ' 효능', ' 부작용']
        result = []
        for keyword in keywords:
            for search_word in search_words:
                final_search_word = keyword + search_word + search_filtering
                result += self.crawling(final_search_word, size)

        # Tokenize: split each crawled page into sentences and extract nouns.
        url = result  # url list
        sentences = []
        sent_tokenize = SentenceTokenizer()
        for i in range(0, len(url)):
            sentences += (sent_tokenize.url2sentences(url[i]))
        print(sentences)
        nouns = sent_tokenize.get_nouns(sentences)
        print(nouns)

        # Train the KR-WordRank model.
        wordrank_extractor = KRWordRank(
            min_count=5,    # minimum word frequency (when building the graph)
            max_length=10,  # maximum word length
            verbose=True
        )
        beta = 0.85  # PageRank decaying factor
        max_iter = 10
        keywords, rank, graph = wordrank_extractor.extract(sentences, beta, max_iter)
        stopwords = get_stopwords()

        # Check the top 30 keywords and their scores.
        for word, r in sorted(keywords.items(), key=lambda x: -x[1])[:30]:
            if word not in stopwords:
                print('%8s : %.4f' % (word, r))

        # Extract key sentences: keep the top-scoring keywords minus stopwords.
        passwords = {word: score for word, score in sorted(
            keywords.items(), key=lambda x: -x[1])[:300] if word not in stopwords}
        print('num passwords = {}'.format(len(passwords)))
        # Prefer sentences of 25-80 characters that do not contain '마지막'.
        penalty = lambda x: 0 if (25 <= len(x) <= 80 and '마지막' not in x) else 1
        keywords, sents = summarize_with_sentences(
            sentences,
            penalty=penalty,
            stopwords=stopwords,
            diversity=0.5,
            num_keywords=100,
            num_keysents=10,
            verbose=False
        )
        for i in sents:
            print(i)

        # Return the result as JSON.
        message = {
            'status': 200,
            'message': 'OK',
            'content': sents
        }
        resp = jsonify(message)
        resp.status_code = 200
        return resp
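    # Example response body for a successful request (shape only; the actual sentences
    # depend on what the crawl and summarizer return):
    #   {"status": 200, "message": "OK", "content": ["key sentence 1", "key sentence 2", ...]}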
    # Crawling method: returns a list of result links.
    def crawling(self, keyword, size):
        baseUrl = 'https://www.google.com/search?q='
        url = baseUrl + quote_plus(keyword)

        # todo: configure webdriver options
        path = '/home/ubuntu/FlaskServer/chromedriver'  # chromedriver.exe / /home/ubuntu/FlaskServer/chromedriver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('headless')
        chrome_options.add_argument("--no-sandbox")   # needed where no GUI is available (Linux, Docker, etc.)
        chrome_options.add_argument("--disable-gpu")  # needed where no GUI is available (Linux, Docker, etc.)
        chrome_options.add_argument('lang=ko_KR')
        # Selenium 3-style constructor: the chromedriver path is passed positionally.
        driver = webdriver.Chrome(path, options=chrome_options)
        driver.get(url)

        # todo: grab the HTML source of the loaded page
        html = driver.page_source
        driver.quit()  # release the browser/chromedriver process
        soup = BeautifulSoup(html, "lxml")

        link_result = {}    # search results, key = document title, value = document link
        link_result_2 = []  # links only
        # These CSS selectors depend on Google's current result markup and may need updating.
        lists = soup.select('#rso > .g')
        # Limit how many sites are collected.
        cur_doc_count = 0
        # todo: store the list of search results
        for item in lists:
            name = item.select_one('.LC20lb.DKV0Md').text
            link = item.a.attrs['href']
            link_result[name] = link
            link_result_2.append(link)
            cur_doc_count += 1
            # Limit how many sites are collected.
            if cur_doc_count >= size:
                break
        print(link_result_2)
        return link_result_2
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()  # konlpy's Twitter tagger (renamed Okt in newer konlpy releases)
        self.stopwords = ['중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보", "기자",
                          "아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가"]

    def url2sentences(self, url):
        # Download and parse an article, split it into sentences, and merge
        # very short sentences (10 characters or fewer) into the previous one.
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def text2sentences(self, text):
        # Same splitting and merging as url2sentences, but for raw text.
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def get_nouns(self, sentences):
        # Extract nouns longer than one character that are not stopwords.
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([noun for noun in self.twitter.nouns(str(sentence))
                                       if noun not in self.stopwords and len(noun) > 1]))
        return nouns
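

# Minimal local-run sketch (assumption: in the real project the Flask app and flask_restx
# Api are created in a separate entry-point module, so the names app/api below are
# illustrative only). It mounts the Crawling namespace so the endpoint can be exercised
# locally, e.g. GET http://localhost:5000/crawling/<keyword>/<size>.
if __name__ == '__main__':
    from flask import Flask
    from flask_restx import Api

    app = Flask(__name__)
    api = Api(app)
    api.add_namespace(Crawling, path='/crawling')
    app.run(host='0.0.0.0', port=5000, debug=True)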