-
Notifications
You must be signed in to change notification settings - Fork 3
/
kouprianov.r.data_transformations.r
executable file
·360 lines (265 loc) · 10.8 KB
/
kouprianov.r.data_transformations.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
################################################################
################################################################
#
# Some data transformations
#
################################################################
################################################################
################################################################
# Sorting
# Generalised form:
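# Vectors and Factors:
# order(x) # returns the positions (indices) of x elements arranged alphanumerically in ascending order
# order(-x) # returns the positions of x elements arranged in descending order (numeric x only: unary minus is not defined for character vectors and factors)
# order(x, decreasing = TRUE) # returns the positions of x elements arranged in descending order (works for character vectors and factors as well)
# x[order(x)] # returns x sorted alphanumerically in ascending order
# x[order(-x)] # returns x sorted in descending order (numeric x only)
# Data frames:
# order(x$y) # returns the row numbers of x arranged alphanumerically in ascending order of x$y
# order(-x$y) # returns the row numbers of x arranged in descending order of x$y (numeric x$y only)
# order(x$y, decreasing = TRUE) # returns the row numbers of x arranged in descending order of x$y (works for character and factor x$y as well)
# x[order(x$y),] # returns x sorted alphanumerically in ascending order of x$y
# x[order(-x$y),] # returns x sorted in descending order of x$y (numeric x$y only)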
# Examples
# Reading the example table: tab-separated text, first line holds the column names
phil <- read.table("philosophers.txt", header = TRUE, sep = "\t")
phil
# Sorting by NAMES in descending order.
# Unary minus is defined for numbers only: -phil$NAMES stops with an error
# for a character column and yields NAs (with a warning) for a factor column,
# so the direction is requested through decreasing = TRUE instead
phil.snd <- phil[order(phil$NAMES, decreasing = TRUE), ]
phil.snd
# Sorting by Freq in ascending order
phil.sfa <- phil[order(phil$Freq), ]
phil.sfa
# Sorting by Freq in descending order (the minus works provided Freq is numeric)
phil.sfd <- phil[order(-phil$Freq), ]
phil.sfd
################################################################
# Subsetting
# Generalised form:
# subset(x, conditions) # returns elements of x that meet the conditions
# Conditions can be specified in several ways:
# x == value # returns elements of x that are equivalent to the value (for all kinds of values except NA)
# x != value # returns elements of x that are not equivalent to the value (for all kinds of values except NA)
# x >= value # returns elements of x that are greater than or equal to the value (for numerical values only)
# x <= value # returns elements of x that are less than or equal to the value (for numerical values only)
# x > value # returns elements of x that are greater than the value (for numerical values only)
# x < value # returns elements of x that are less than the value (for numerical values only)
# Conditions can be combined:
# & # means logical AND
# | # means logical OR
# () # define the priorities in application of the conditions
# Examples:
# subset(x, condition.1 & condition.2)
# subset(x, condition.1 | condition.2)
# subset(x, condition.1 & (condition.2 | condition.3))
# Subset can be applied to both vectors/factors and data frames:
#
# Examples:
# A small integer vector: the sequence 1, 2, 3, 4 repeated three times
test <- rep(1:4, times = 3)
test
# Keeping only the elements that are greater than 2
subset(test, subset = test > 2)
# Keeping the elements that are at least 1 and, at the same time, less than 3
subset(test, subset = test >= 1 & test < 3)
# Size and per-column summary of the full data frame
# (students.df is not created in this script; it must already exist)
dim(students.df)
summary(students.df)
# Rows with SEX equal to "f"; inside subset() the columns of the data frame
# can be referred to by their bare names
students.f.df <- subset(students.df, subset = SEX == "f")
dim(students.f.df)
summary(students.f.df)
# Rows with SEX equal to "f" and, at the same time, HEIGHT not exceeding 155
students.f.le155.df <- subset(students.df, subset = SEX == "f" & HEIGHT <= 155)
dim(students.f.le155.df)
summary(students.f.le155.df)
################################################################
# Working with lists of values
# Generalised form:
# unique(x) # returns list of unique elements present in x
# intersect(x, y) # returns list of elements common to x and y
# setdiff(x, y) # returns elements present in x but not in y
# First vector: each of the letters "a", "b", "c", "d" twice in a row
test.201802261402.01 <- rep(c("a", "b", "c", "d"), each = 2)
# Second vector: shares "c" and "d" with the first one
test.201802261402.02 <- c("c", "d", "e", "f")
test.201802261402.01
test.201802261402.02
# Distinct values of the first vector
unique(test.201802261402.01)
# Values common to both vectors (here the result is "c", "d" for either order of arguments)
intersect(test.201802261402.01, test.201802261402.02)
intersect(test.201802261402.02, test.201802261402.01)
# Values of the first argument absent from the second one (the order of arguments matters)
setdiff(test.201802261402.01, test.201802261402.02)
setdiff(test.201802261402.02, test.201802261402.01)
################################################################
# The for(){} loop and its use for subsetting
# Generalised form:
# for(i in x){
# looped operations
# }
# All variables used for looped operations must be created outside of the loop
# x : a vector of numerical values used sequentially as values for i in the loop iterations
# looped operations : any operations that are required
# Examples:
# The following script subsets data for different regions of the pres.2008 into elements of regions.ls list object
# The following code brings exactly the same results as the code for the while(){} loop in the following subsection; compare them, please
# The key difference is the way in which i values are specified; both ways are rather flexible, though in some cases one could be more convenient than the other
# Obtaining a sorted list of region names
regions.list <- unique(pres.2008$REGION)
regions.list <- regions.list[order(regions.list)]
# Creating regions.ls list object (one empty slot per region) to be filled within the loop
regions.ls <- vector("list", length(regions.list))
for (i in seq_along(regions.list)) { # starting the loop; seq_along() gives no iterations if regions.list is empty
  # subsetting lines with pres.2008$REGION equivalent to regions.list[i];
  # the data frame and the condition are two separate arguments of subset(),
  # divided by a comma (an assignment arrow here would overwrite pres.2008)
  regions.ls[[i]] <- subset(pres.2008, pres.2008$REGION == regions.list[i])
}
# Previewing two regions with summary() to observe the results
# (the second call assumes that there are at least 50 regions)
summary(regions.ls[[1]])
summary(regions.ls[[50]])
################################################################
# The while(){} loop and its use for subsetting
# Generalised form:
# i <- start.value.i
# while(condition.i){
# looped operations
# i <- i + increment.i
# }
# All variables used for looped operations must be created outside of the loop
# condition.i : compares the counter variable (i) to some value which should be reached by increments
# looped operations : any operations that are required
# increment.i : some value (positive or negative) that is added to the counter (i) at the end of each cycle
# Cycles can be nested
# Generalised form:
# i <- start.value.i
# j <- NULL
# while(condition.i){
# j <- start.value.j
# while(condition.j){
# looped operations
# j <- j + increment.j
# }
# i <- i + increment.i
# }
# Examples:
# The following script subsets data for different regions of the pres.2008 into elements of regions.ls list object
# The following code brings exactly the same results as the code for the for(){} loop in the preceding subsection; compare them, please
# The key difference is the way in which i values are specified; both ways are rather flexible, though in some cases one could be more convenient than the other
# Obtaining a sorted list of region names
regions.list <- unique(pres.2008$REGION)
regions.list <- regions.list[order(regions.list)]
# Creating regions.ls list object (one empty slot per region) to be filled within the loop
regions.ls <- vector("list", length(regions.list))
i <- 1 # assigning start value to the counter
while (i <= length(regions.list)) { # specifying condition
  # subsetting lines with pres.2008$REGION equivalent to regions.list[i];
  # the data frame and the condition are two separate arguments of subset(),
  # divided by a comma (an assignment arrow here would overwrite pres.2008)
  regions.ls[[i]] <- subset(pres.2008, pres.2008$REGION == regions.list[i])
  i <- i + 1 # adding counter increment
}
# Previewing two regions with summary() to observe the results
# (the second call assumes that there are at least 50 regions)
summary(regions.ls[[1]])
summary(regions.ls[[50]])
################################################################
# Binding vectors together: cbind(), rbind(),
# cbind.data.frame(), and rbind.data.frame()
# Generalised form:
# x12 <- cbind(x1, x2)
# x12 <- rbind(x1, x2)
# x12 <- cbind.data.frame(x1, x2)
# x12 <- rbind.data.frame(x1, x2)
# Note, please, that cbind(), which BINDs Columns together,
# and rbind(), which BINDs Rows together, may corrupt data types
# when used carelessly. They produce data frames at the output
# only if one of the input objects is a data frame. When binding
# vectors they end up with a matrix (converting everything to character
# data type if the vectors are of different data types).
#
# For data.frames, it is safer to use cbind.data.frame() and rbind.data.frame()
################################################################
# Transforming data types
# Generalised form:
# x.factor <- as.factor(x.chr) # converts a character vector to a factor;
# x.chr <- as.character(x.factor) # converts a factor to a character vector;
#
# x.num <- as.numeric(x.factor) # converts numeric codes of the factor levels to numeric (integer) values;
# x.factor <- as.factor(x.num) # converts numeric values to factor levels;
#
# x.num <- as.numeric(x.chr) # If and only if x.chr contains numbers only;
# x.chr <- as.character(x.numeric) # converts a vector of numeric values to a character vector;
# Creating initial vector of text values;
x.chr <- c("dog","mouse","cat","cat")
# Transforming data types;
# character -> factor: the distinct values of x.chr become the factor levels;
x.factor <- as.factor(x.chr)
# factor -> numeric: yields the internal codes of the levels, not the original text;
x.num <- as.numeric(x.factor)
# numeric -> character: the level codes written as text;
x.chr.1 <- as.character(x.num)
# factor -> character: gives back the text labels of the levels;
x.chr.2 <- as.character(x.factor)
# character -> numeric: works here because x.chr.1 contains numbers only;
x.num.1 <- as.numeric(x.chr.1)
# numeric -> factor: every distinct code becomes a level of the new factor;
x.factor.1 <- as.factor(x.num)
# Previewing vectors;
x.chr
x.factor
x.num
x.chr.1
x.chr.2
x.num.1
x.factor.1
# Assessing vectors' structure;
str(x.chr)
str(x.factor)
str(x.num)
str(x.chr.1)
str(x.chr.2)
str(x.num.1)
str(x.factor.1)
# Checking vectors' data type (only for some most instructive vectors);
# each vector is tested with is.character(), is.factor(), and is.numeric() in turn;
# the original character vector:
is.character(x.chr)
is.factor(x.chr)
is.numeric(x.chr)
# the factor built from it (note that a factor is reported as neither character nor numeric):
is.character(x.factor)
is.factor(x.factor)
is.numeric(x.factor)
# the numeric level codes:
is.character(x.num)
is.factor(x.num)
is.numeric(x.num)
# the level codes converted to text:
is.character(x.chr.1)
is.factor(x.chr.1)
is.numeric(x.chr.1)
# the factor built from the numeric codes:
is.character(x.factor.1)
is.factor(x.factor.1)
is.numeric(x.factor.1)
################################################################
# Managing factor levels: factor(), levels()
# Generalised form:
# x.factor <- as.factor(x.chr)
# x.factor <- factor(x.chr)
# levels(x.factor)
# x.factor.y <- factor(x.chr, levels=y)
#
# x.chr <- x.factor.y
# x.factor.z <- factor(x.chr, levels=z)
################################################################
# Merging two data frames into one: merge()
# Generalised form:
# x12 <- merge(x1, x2)
# Sometimes, you need to unite two data frames in which some columns
# can be identified with each other while the others can not.
# E. g., you managed to obtain variables A, B, and C for the dataset X1
# and A, B, C, and D for the dataset X2. Even though it would not be
# possible to consider interdependence of A and D in all the data available,
# you can still do it for A, B, and C on a combined dataset. Of course,
# you can always play around with c(), rbind.data.frame(), data type
# transformations and factor levels (if something goes wrong),
# but there is a quicker and safer way.
#
# merge() function can carefully join two data frames keeping all variables
# or skipping the ones, which do not match.
# Creating experimental data frame 1 (10 rows); the variable names are
# given directly in the call, the single value of X is recycled to all rows:
X1 <- data.frame(
  X = 1,
  A = sample(100, 10),
  B = rnorm(10, 20, 4),
  C = factor(rep(c("f", "m"), each = 5), levels = c("f", "m"))
)
# Creating experimental data frame 2 (10 rows); it has an extra variable D,
# and the two-element factor for C is recycled to f, m, f, m, ...:
X2 <- data.frame(
  X = 2,
  A = sample(100, 10),
  B = rnorm(10, 20, 4),
  D = sample(100, 10),
  C = factor(c("f", "m"), levels = c("f", "m"))
)
# Merging two data frames on the variables they share (X, A, B, C).
# X is 1 in X1 and 2 in X2, so no rows match each other; all = TRUE keeps
# the unmatched rows of both data frames and fills D with NA for the rows of X1:
X12 <- merge(X1, X2, all = TRUE)
# Read carefully help(merge), please. It is barely understandable but useful.
# Previewing the experimental data.frames and the result of merge()
X1
X2
X12