-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze_politifact.R
139 lines (114 loc) · 4.86 KB
/
analyze_politifact.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"
Aim:
Analyzing politifact data
Bias in Scrutiny, Batting Average by R and D
Author: Gaurav Sood
Date: 5/25/2016
"
# Global Opt
options(stringsAsFactors=FALSE)
# Base libs
library(goji)
# set dir
setwd(githubdir)
setwd("politifact")
# Load dat
library(curl)
pol_facts <- read.csv(curl("https://github.com/raw/soodoku/politifact/master/politifacts.csv"))
pol_names <- read.csv(curl("https://github.com/raw/soodoku/politifact/master/pol_names.csv"))
pol_all <- merge(pol_facts, pol_names, by.x="page_names", by.y="x", all.x=T, all.y=F)
# Subset on statements by orgs. whose partisanship we know
pol_all <- subset(pol_all, (lean_dem %in% c("1", "0")))
# Ratings
pol_all$ratings <- zero1(car::recode(pol_all$politifact.cat.j., "'true'=6;'mostly-true'=5;'half-true'=4;'barely-true'=3;'false'=2;'pants-fire'=1"))
# 1. Selection Bias
# -----------------------------
mean(pol_all$lean_dem==1)
# [1] 0.4434629
mean(pol_all$lean_dem==0)
# 2. Batting Average---assuming no bias in selection of statements
# -----------------------------------------------------------------
sum(pol_all$ratings[pol_all$lean_dem==1])/sum(pol_all$lean_dem==1)
# [1] 0.6329349
sum(pol_all$ratings[pol_all$lean_dem==0])/sum(pol_all$lean_dem==0)
# [1] 0.4915344
# 3. By person: Distribution: Is it a consequence of a few people?
# -------------------------------------------------------------------
library(plyr)
library(dplyr)
by_person <- ddply(pol_all, ~page_names, summarise, total=n(), mean_rat = mean(ratings), dem = mean(as.numeric(lean_dem)))
t30_person <- by_person[order(-by_person$total),][1:30,]
# The plot
library(ggplot2)
library(scales)
library(grid)
# Reorder
t30_person$page_names <- factor(t30_person$page_names, levels=t30_person$page_names[order(t30_person$total)], ordered=TRUE)
ggplot(t30_person, aes(total, page_names)) +
geom_point(aes(colour=factor(dem), alpha=.8)) +
ylab("") +
xlab("Total Number of Statements Investigated") +
scale_colour_manual(values=c("#dd3333", "#3333dd")) +
scale_x_continuous(breaks=seq(0, 500, 100), labels=seq(0, 500, 100)) +
theme_minimal(base_size=9) +
theme(panel.grid.major=element_line(color="#F0F0F0",size=.25)) +
theme(panel.grid.minor=element_blank()) +
theme(axis.ticks=element_blank()) +
theme(legend.position="none") +
theme(plot.title=element_text(color="#525252", size=10, vjust=1.25)) +
theme(axis.text.x=element_text(size=9, color="#636363")) +
theme(axis.text.y=element_text(size=9, color="#636363")) +
theme(axis.title.x=element_text(size=10, color="#323232", vjust=0)) +
theme(axis.title.y=element_text(size=10, color="#323232", vjust=1.25)) +
theme(plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"))
ggsave("figs/t30_total_investigated.pdf")
ggsave("figs/t30_total_investigated.png")
# 4. Since there is such a large skew, we can take Obama out and see again
# -------------------------------------------------------------------------
mean(pol_all$lean_dem[pol_all$page_names!="Barack Obama"]==1)
# [1] 0.3398533
# Let us take till 90th percentile (arbitrary but ok)
quantile(by_person$total, .9)
select_names <- by_person$page_names[by_person$total < quantile(by_person$total, .9)]
mean(pol_all$lean_dem[pol_all$page_names %in% select_names]==1)
# [1] 0.4275583
# 5. Check if selection bias by batting average
# ------------------------------------------------
# Relationship b/w how often an actor is covered and how often they are incorrect
with(by_person, cor(total, mean_rat))
# [1] 0.01176192
with(t30_person, cor(total, mean_rat))
# [1] 0.2451008
# By Party
ddply(by_person, .(dem), function(x) cor(x$mean_rat, x$total))
# dem V1
# 1 0 0.01052708
# 2 1 0.01220616
# Running pearson
ddply(by_person, .(dem), function(x) cor(x$mean_rat, x$total, method="spearman"))
# dem V1
# 1 0 0.0643790413
# 2 1 0.0004538915
ddply(by_person, .(dem), function(x) cor(x$mean_rat, log(x$total)))
# dem V1
# 1 0 0.02467068
# 2 1 0.01017017
ggplot(by_person, aes(total, mean_rat)) +
geom_point(aes(colour=factor(dem), alpha=.8)) +
ylab("Batting Average") +
xlab("Total Number of Statements Investigated") +
scale_colour_manual(values=c("#dd3333", "#3333dd")) +
scale_y_continuous(breaks=seq(0, 1, .25), labels=nolead0s(seq(0, 1, .25))) +
theme_minimal(base_size=9) +
theme(panel.grid.major=element_line(color="#F0F0F0",size=.25)) +
theme(panel.grid.minor=element_blank()) +
theme(axis.ticks=element_blank()) +
theme(legend.position="none") +
theme(plot.title=element_text(color="#525252", size=10, vjust=1.25)) +
theme(axis.text.x=element_text(size=9, color="#636363")) +
theme(axis.text.y=element_text(size=9, color="#636363")) +
theme(axis.title.x=element_text(size=10, color="#323232", vjust=0)) +
theme(axis.title.y=element_text(size=10, color="#323232", vjust=1.25)) +
theme(plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"))
ggsave("figs/batting_average_total_investigated.pdf")
ggsave("figs/batting_average_total_investigated.png")