-
Notifications
You must be signed in to change notification settings - Fork 0
/
posts.rb
82 lines (71 loc) · 2.06 KB
/
posts.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# get_posts.rb
require 'nokogiri'
require "sqlite3"
def parse_tags_str(tags_str)
tags = []
tags_str ||= ''
tag = ''
tags_str.each_char do |c|
if c == '<'
tag = ''
elsif c == '>'
tags << tag
else
tag += c
end
end
return tags
end
def get_posts(lang)
# read large xml file
fname = "Posts.xml"
xml = Nokogiri::XML::Reader(File.open(fname))
# open database
db = SQLite3::Database.new("#{lang}_posts.db")
db.execute("CREATE TABLE IF NOT EXISTS posts (id integer, body text, owner_display_name varchar(255), creation_date text, last_edit_date text, tags text, score integer, favorite_count integer, word_count integer)")
# construct prepared statement, for efficiency
sql = "INSERT INTO posts (id, body, owner_display_name, creation_date, last_edit_date, tags, score, favorite_count) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
stmt = db.prepare(sql)
trans_size = 50000
post_count = 0
lang_count = 0
dirty = false
xml.each do |posts|
posts.each do |row|
next if row.node_type == 14 # TYPE_SIGNIFICANT_WHITESPACE
post_count += 1
puts "post_count: #{post_count}" if post_count % 1000000 == 0
id = row.attribute("Id").to_i
tags_str = row.attribute("Tags")
tags = parse_tags_str(tags_str)
#puts tags.inspect
if tags.include?(lang)
lang_count += 1
puts "#{lang}_count: #{lang_count}" if lang_count % trans_size == 0
#puts tags.inspect
if lang_count % trans_size == 1
db.transaction
dirty = true
end
body = row.attribute("Body")
owner = row.attribute("OwnerDisplayName")
creation = row.attribute("CreationDate")
edit = row.attribute("LastEditDate")
score = row.attribute("Score")
favorite_count = row.attribute("FavoriteCount")
favorite_count = 0 if favorite_count.nil?
stmt.execute(id, body, owner, creation, edit, tags_str, score, favorite_count)
if lang_count % trans_size == 0
db.commit
dirty = false
end
end
end
end
if dirty == true
db.commit
puts 'Final commit'
end
puts "post_count: #{post_count}"
puts "#{lang}_count: #{lang_count}"
end #get_posts