-
Notifications
You must be signed in to change notification settings - Fork 33
/
DataStudy.py
142 lines (131 loc) · 5.69 KB
/
DataStudy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#-*- coding:utf-8 -*-
import jieba
import urllib
import sys
from jieba import analyse
import os
import jieba.posseg as pseg
import numpy as np
import numpy
import jieba.analyse
from sklearn import feature_extraction
#############################################################################
# Section 1: segment each abstract with jieba's TF-IDF keyword extraction
# and write one tab-separated line of terms per abstract.
# 'with' blocks guarantee every handle is closed (the original leaked the
# input file and the stop-word file handle).
with open('StopWords.txt') as StopFile:
    # A set gives the same O(1) membership test the original's
    # keys-only dict did, with clearer intent.
    stopwords = set(line.rstrip() for line in StopFile)
with open('Cnki_label _abstruct2.csv', 'r') as WordCast, \
     open('BitCnkiNOLabelAbstractJieba.txt', 'w') as JiebaCast:
    for ReadLine in WordCast.readlines():
        # Top 50 keywords per abstract; weights hidden, no POS filter.
        AbstractPart = jieba.analyse.extract_tags(
            ReadLine, topK=50, withWeight=False, allowPOS=())
        # Encode to UTF-8 bytes (Python 2) so terms compare equal to the
        # byte strings read from StopWords.txt, then drop stop words.
        segs = [word.encode('utf-8') for word in AbstractPart]
        segs = [word for word in segs if word not in stopwords]
        for seg in segs:
            JiebaCast.write('%s\t' % seg)
        JiebaCast.write('\n')
#################################################################
# Section 2: build a 0/1 bag-of-words vector per abstract.
# Each vector has one slot per dictionary word plus a final slot holding
# the numeric class label (A..H -> 1..8).  Vectors are written out in
# chunks of 2000 abstracts per file.
WordCast = open('DictionaryText.txt', 'r')            # global word dictionary
ReadLines = WordCast.readlines()
LabelOnly1 = open('OnlyTable.txt', 'r')               # one label line per abstract
LabelOnly = LabelOnly1.readlines()
JiebaWordCast = open('BitCnkiNOLabelAbstractJieba.txt', 'r')  # segmented abstracts
JiebaReadLines = JiebaWordCast.readlines()
DictionaryWriteNumber = 0
# os.path.join replaces the original's hard-coded backslash separator,
# which only produced the intended path on Windows.
VectorEachAbstract = open(os.path.join('VectorEachAbstract', '%s.txt' % DictionaryWriteNumber), 'w')
DictionaryNumber = 0
DictionaryTranslate = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8}
# Strip newlines from the dictionary ONCE, outside the loops, instead of
# re-stripping the whole dictionary for every term of every abstract.
DictionaryWords = [ReadLine.strip('\n') for ReadLine in ReadLines]
for JiebaText in JiebaReadLines:
    VectorAbstract = JiebaText.split("\t")
    TextVector = numpy.zeros([len(ReadLines) + 1])
    for Vector in VectorAbstract:
        # Flag the dictionary slot of each term present in this abstract.
        for VectorPar, DictWord in enumerate(DictionaryWords):
            if DictWord == Vector:
                TextVector[VectorPar] = 1
                break
    # The original used eight independent `if`s (not elif), so when several
    # letters appear the LAST matching letter wins; scanning A..H in order
    # and overwriting on each hit preserves that behavior exactly.
    for Letter in 'ABCDEFGH':
        if Letter in LabelOnly[DictionaryNumber]:
            TextVector[len(ReadLines)] = DictionaryTranslate[Letter]
    print(DictionaryNumber)
    DictionaryNumber += 1
    for Vector1 in TextVector:
        VectorEachAbstract.write('%s\t' % Vector1)
    VectorEachAbstract.write('\n')
    # Rotate to a new output file every 2000 abstracts.
    if DictionaryNumber % 2000 == 0:
        DictionaryWriteNumber += 1
        VectorEachAbstract.close()
        VectorEachAbstract = open(os.path.join('VectorEachAbstract', '%s.txt' % DictionaryWriteNumber), 'w')
WordCast.close()
LabelOnly1.close()
JiebaWordCast.close()
VectorEachAbstract.close()
#################################################################
# Section 3: extract the 3-character category label (A..H prefix) from the
# first tab-separated field of every corpus line; save one label per line.
WordCast = open('Cnki_label_abstruct.txt', 'r')
ReadLines = WordCast.readlines()
LabelSelect = []
WriteLines = open('LabelOnly.txt', 'w')
for ReadLine in ReadLines:
    # Trim the trailing newline, then surrounding spaces, then split on tabs.
    Fields = ReadLine.strip('\n').strip(' ').split("\t")
    # Keep only the first three characters of the label column.
    Fields[0] = Fields[0][0:3]
    LabelSelect.append(Fields)
    print(Fields[0])
    WriteLines.write('%s\n' % Fields[0])
# Close the handles the original leaked.  (Unused locals `first_ele`,
# `JiebaDictionary` and the dead `LineNumpy=[]` init were removed.)
WriteLines.close()
WordCast.close()
########################################################
# Section 4: build the global dictionary with jieba's TF-IDF keyword
# extraction (top 10000 terms over the whole corpus), drop stop words and
# save one term per line.  TF-IDF acts as the dimensionality reducer here.
# (A dead, commented-out "method 1" that segmented the raw text with
# jieba.cut, and an unused mid-file `import codecs`, were removed.)
with open('Cnki_label _abstruct2.csv', 'r') as WordCast:
    CorpusText = WordCast.read()
keywords = jieba.analyse.extract_tags(CorpusText, topK=10000, withWeight=False, allowPOS=())
with open('StopWords.txt') as StopFile:
    stopwords = set(line.rstrip() for line in StopFile)
print(keywords)
# Encode to UTF-8 bytes (Python 2) so terms compare equal to the byte
# strings read from StopWords.txt, then drop stop words.
segs = [word.encode('utf-8') for word in keywords]
segs = [word for word in segs if word not in stopwords]
DictionaryText = []
with open('DictionaryText.txt', 'w') as WriteLines:
    for seg in segs:
        WriteLines.write('%s\n' % seg)
        DictionaryText.append(seg)
        print(seg)