-
Notifications
You must be signed in to change notification settings - Fork 0
/
db_doc_text_saver_nolabel.py
157 lines (124 loc) · 5.52 KB
/
db_doc_text_saver_nolabel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding: utf-8 -*-
"""
Copyright 2019-2021 Lummetry.AI (4E SOFTWARE SRL). All Rights Reserved.
* NOTICE: All information contained herein is, and remains the property of 4E SOFTWARE SRL.
* The intellectual and technical concepts contained herein are proprietary to 4E SOFTWARE SRL
* and may be covered by Romanian and Foreign Patents, patents in process, and are protected
* by trade secret or copyright law.
* Dissemination of this information or reproduction of this material is strictly forbidden
* unless prior written permission is obtained from 4E SOFTWARE SRL.
*
*
* RO:
* Modul software TempRent, proiect finanțat în cadrul POC, Axa prioritara 2 - Tehnologia Informației și Comunicațiilor (TIC)
* pentru o economie digitală competitivă, Prioritatea de investiții 2b - Dezvoltarea produselor și
* serviciilor TIC, a comerțului electronic și a cererii de TIC, cod SMIS 142474,
* Contractul de finanțare nr. 2/221_ap3/24.06.2021.
*
RO:
Acest produs a fost livrat si realizat in baza serviciilor de cercetare-inovare industrială
conform contract de servicii nr. 9 din 01.11.2021 folosind modulele AI "ALLAN" aferente "TempRent" -
Proiect finanțat în cadrul POC, Axa prioritara 2 - Tehnologia Informației și Comunicațiilor (TIC)
pentru o economie digitală competitivă, Prioritatea de investiții 2b - Dezvoltarea produselor și
serviciilor TIC, a comerțului electronic și a cererii de TIC, cod SMIS 142474,
Contractul de finanțare nr. 2/221_ap3/24.06.2021.
"""
import os
import sys
import time
from collections import deque

import numpy as np

from libraries import Logger
from libraries.db_conn.odbc_conn import ODBCConnector
from utils.utils import raw_text_to_words, clean_words_list
if __name__ == '__main__':
    # Export document paragraph texts from the DB into plain-text chunk files
    # ("texts<N>.txt") for language-model pretraining. No labels are collected.
    how_often_to_save = 10000    # flush buffered documents to disk every N docs
    how_often_to_report = 10     # print a progress line every N docs
    resume_from_chunk = 7        # skip docs already exported in chunks 1..7 (resume support)

    log = Logger(
        lib_name='DBSV', base_folder='.', app_folder='_cache',
        TF_KERAS=False
    )
    # SECURITY NOTE(review): database credentials are hard-coded in source;
    # they should be loaded from environment variables or a secrets store.
    config = {
        'CONNECT_PARAMS': {
            'DRIVER': '{ODBC Driver 17 for SQL Server}',
            'SERVER': '195.60.78.50',
            'PORT': 1433,
            'DATABASE': 'LegeV',
            'Uid': 'damian',
            'Pwd': '4Esoft1234!@#$2021',
        },
        'QUERY_PARAMS': None
    }
    # qry_docs = 'select TOP (10000) id from document'
    qry_docs = 'select distinct id from document'
    qry_txt = 'select continut from paragraf where id_document={}'

    conn = ODBCConnector(log=log, verbose=False, config=config)
    conn.connect(nr_retries=5)
    df_docs = conn.get_data(sql_query=qry_docs)

    lst_X_docs = []
    lst_y_labels = []
    unique_labels = set()

    DEBUG = len(sys.argv) > 1 and sys.argv[1].upper() == 'DEBUG'
    log.P("Running params: {}. Debug mode {}".format(sys.argv, "ON" if DEBUG else "OFF"))

    n_iters = df_docs.shape[0]
    timings = deque(maxlen=10)   # rolling window for per-doc time estimate
    global_lens = []             # char length of every exported document
    skipped = []                 # [id_doc, nr_paragraphs] for out-of-range docs
    idx_doc = -1                 # fix: keep defined even if the loop never runs

    # fix: make sure the output folder exists before the first flush
    out_dir = "_cache/_data/pretraining_texts"
    os.makedirs(out_dir, exist_ok=True)

    for idx_doc in range(n_iters):
        # resume: documents covered by already-written chunks are skipped
        if idx_doc < resume_from_chunk * how_often_to_save:
            continue
        t0 = time.time()
        id_doc = df_docs.iloc[idx_doc, 0]
        # fetch all paragraphs of the current document
        df_text = conn.get_data(sql_query=qry_txt.format(id_doc))
        # drop outlier documents: too many or too few paragraphs to be useful
        if df_text.shape[0] > 1000 or df_text.shape[0] < 4:
            skipped.append([id_doc, df_text.shape[0]])
            continue
        lst_doc_txt = []
        for idx_txt in range(df_text.shape[0]):
            txt = df_text.iloc[idx_txt, 0]
            if txt is not None:  # fix: identity comparison instead of `!= None`
                # sentinel marks paragraph boundaries; turned into '\n' below
                lst_doc_txt.append(txt + "pastreaza_cuvant")
        raw_doc_str = " ".join(lst_doc_txt)
        doc_str = raw_text_to_words(raw_doc_str, max_len=15)
        doc_str = " ".join(w.strip() for w in doc_str)
        # restore paragraph breaks from the sentinel (all spacing variants)
        doc_str = (doc_str
                   .replace("pastreaza_cuvant pastreaza_cuvant ", "\n")
                   .replace("pastreaza_cuvant ", "\n")
                   .replace(" pastreaza_cuvant", "\n")
                   .replace("pastreaza_cuvant", "\n"))
        if "pastreaza_cuvant" in doc_str:
            # sanity check: every sentinel occurrence must have been replaced
            print(doc_str)
            print("pastreaza_cuvant still in text")
            sys.exit()
        lst_X_docs.append(doc_str)

        lap_time = time.time() - t0
        timings.append(lap_time)
        mean_time = np.mean(timings)
        remaining_time = (n_iters - (idx_doc + 1)) * mean_time
        if (idx_doc % how_often_to_report) == 0:
            # fix: removed stray '\r' inside the template and closed the '(' in 's/doc'
            print("\rProcessed {}/{} documents ({:.1f}%). Remaining time {:.0f}s/{} ({:.1f}s/doc)".format(
                idx_doc + 1, n_iters,
                (idx_doc + 1) / n_iters * 100,
                remaining_time,
                time.strftime("%H:%M:%S", time.gmtime(remaining_time)),
                mean_time),
                end='', flush=True)
        if ((idx_doc + 1) % how_often_to_save) == 0:
            chunk_nr = (idx_doc + 1) // how_often_to_save
            # fix: explicit utf-8 so Romanian diacritics survive on any platform
            with open("{}/texts{}.txt".format(out_dir, chunk_nr), "w", encoding="utf-8") as save_file:
                for doc in lst_X_docs:
                    save_file.write(doc)
                    save_file.write("\n")
            global_lens.extend(len(x) for x in lst_X_docs)
            lst_X_docs = []   # reset buffer after flushing a chunk

    if len(lst_X_docs) > 0:
        global_lens.extend(len(x) for x in lst_X_docs)

    log.P("Obtained {} documents:".format(len(global_lens)))
    if global_lens:
        # fix: histogram over ALL documents (`global_lens`), not the possibly
        # undefined/stale `lens` of the last chunk only
        log.show_text_histogram(global_lens, show_both_ends=True, caption='Words per document')
        log.P("Hist:\n{}".format(np.histogram(global_lens)))
        # fix: second statistic printed np.mean twice; median was intended
        # (mirrors the min/mean/median/max line for `skipped` below)
        print(np.min(global_lens), np.mean(global_lens), np.median(global_lens),
              np.max(global_lens), np.percentile(global_lens, 90))
    s = [x[1] for x in skipped]
    if s:  # fix: np.min/np.max raise on an empty sequence
        print("len", len(s), np.min(s), np.mean(s), np.median(s), np.max(s))

    if len(lst_X_docs) > 0:
        # fix: '+1' so the trailing partial chunk gets a fresh file number and
        # never overwrites the last full chunk written inside the loop
        chunk_nr = (idx_doc + 1) // how_often_to_save + 1
        with open("{}/texts{}.txt".format(out_dir, chunk_nr), "w", encoding="utf-8") as save_file:
            for doc in lst_X_docs:
                save_file.write(doc)
                save_file.write("\n")