Corpus Linguistics with R, Day 1

On July 28, 2009, in Code, by cornelius

(This post documents the first day of a class on R that I took at ESU C&T. It is posted here purely for my own use.)


R Lesson 1

> 2+3; 2/3; 2^3
[1] 5
[1] 0.6666667
[1] 8

---

Fundamentals - Functions

> log(x=1000, base=10)
[1] 3

---

(formals() lists the formal arguments of a function, i.e. its call signature)

formals(sample)

---

Variables

( <- allows you to save something in a data structure (variable) )
> a<-2+3
> a
[1] 5

# is for comments

whitespace doesn't matter

---
# Pick files
file.choose()

# Get working dir
getwd()

# Set working dir
setwd("..")

# Save
> save(VARIABLE_NAME, file=file.choose())
Fehler in save(test, file = file.choose()) : Objekt ‘test’ nicht gefunden
> save.image("FILE_NAME")

---

> setwd("/home/cornelius/Code/samples/Brown_95perc")
> getwd()
[1] "/home/cornelius/Code/samples/Brown_95perc"
> dir()

> my_array <- c(1,2,3,4)
> my_array
[1] 1 2 3 4
> my_array <- c("lalala", "lululu", "bla")
> my_array2 <- c(1,2,3,4)
> c(my_array, my_array2)
[1] "lalala" "lululu" "bla" "1" "2" "3" "4"
>

# it is possible to add something to ALL values in a vector, i.e.
my_array2 + 10

# c() (combine/concatenate) makes a vector
stuff1<-c(1,2,3,4,5)

---

# sequence starts at 1 (first arg, from), ends at 5 (second arg, to), increments by 1 (third arg, by)
seq(1, 5, 1)

---

# put a file into a corpus vector
# what=real|char sep=separator
> my_corpus<-scan(file=file.choose(), what="char", sep="\n")

# unique elements in my array
unique(array)

# count elements in an array
table(array)

# sort elements in an array
sort(table(array))

---
# this tells me the position of the elements in my text that aren't "this"
> values<-which(my_little_corpus!="this")
> values
[1] 2 3 4 5 6 7 8 9 11 12 13 14

# this will produce TRUE|FALSE for my condition (is this element "this")
> values<-my_little_corpus!="this"
> values
[1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
[13] TRUE TRUE

# this will return the array without "this"
> values<-my_little_corpus[my_little_corpus!="this"]
> values
[1] "is" "just" "a" "little" "example" "bla" "bla"
[8] "bla" "is" "the" "third" "line"

...

> cc<-c("banana", "bagel")
> cc == "banana"; cc!="banana" #
[1] TRUE FALSE
[1] FALSE TRUE
> "banana" %in% cc
[1] TRUE
> c("bagel", "banana") %in% cc
[1] TRUE TRUE
> match ("banana", cc)
[1] 1
> match (c("bagel","banana"), cc)
[1] 2 1

# match() looks up each token in the first argument and returns its position in the second argument (the vector being searched)

---
> cat(bb, sep="\n", file=scan(what="char"), append=F)
# write the contents of bb to a file, ask the user for file

moo<-scan(what="char")
# read something the user types into a var

# Clear Mem
> rm(list=ls(all=T))
>

---

# create vector1 (ordered)
vector1<-c("a","b","c","d","e","f","g","h","i","j")

# or
# > letters[1:10]
# [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"

# create vector2 (random)
# > vector2<-sample(vector1)

---

length()
# number of elements

nchar()
# number of characters

> aa<-"know"
> nchar(aa)
[1] 4
> aa<-c("I","do","not","know")
> nchar(aa)
[1] 1 2 3 4
> lala<-c("cat","gnu","hippopotamus")
> lala
[1] "cat" "gnu" "hippopotamus"
> nchar(lala)
[1] 3 3 12

> substr("hippopotamus", 0, 5)
[1] "hippo"
>

# paste() joins strings, like implode(); strsplit() is the counterpart, like explode()
paste (string, sep="my_seperator", collapse="stuff to put in")

---

# percentages
x/sum(x)

barplot (c(1,2,3))

Read in corpus data and build a list of word frequencies
1) scan file
2) strsplit by " "
3) unlist to make vector
4) make a table with freqs
5) sort
6) output

#search for strings
grep("needle", haystack)

> grep("is", text, value=T)
[1] "This is a first example sentence."
[2] "And this is a second example sentence."
> grep("And", text, value=T)
[1] "And this is a second example sentence."
> grep("sentence", text, value=T)
[1] "This is a first example sentence."
[2] "And this is a second example sentence."
>

gregexpr
# alternative to grep; returns a list with one vector of match positions per input string

> mat<-gregexpr("e", text)
> mat
[[1]]
[1] 17 23 26 29 32
attr(,"match.length")
[1] 1 1 1 1 1

[[2]]
[1] 16 22 28 31 34 37
attr(,"match.length")
[1] 1 1 1 1 1 1

> unlist(mat)
[1] 17 23 26 29 32 16 22 28 31 34 37
> mat<-gregexpr("sentence", text)
> sapply (mat, c)
[1] 25 30

Tagged with:  

Comments are closed.