Commit a8b026b8 authored by Zheng Liu

back up

parent 4caa6740
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Text

Base = declarative_base()

# ORM mapping for a simple "comment" table
class Comment(Base):
    __tablename__ = 'comment'
    id = Column(Integer, primary_key=True)
    comment = Column(Text)

# create the SQLite database file and a session factory bound to it
engine = create_engine('sqlite:///BENM.db')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
## Editor
vals <- ...
vals <- seq(...,
...)
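# One illustrative way to fill in the seq() blanks above; the actual from/to/by
# values are not specified in the worksheet.
vals_example <- seq(from = 1,
  to = 100,
  by = 10)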
## Load Data
storm <- read.csv(...)
storm <- read.csv(
...,
...)
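# A hedged, filled-in sketch of the read.csv() call; the file path is borrowed
# from the StormEvents data used later in this worksheet, and the extra
# arguments are assumptions.
storm_example <- read.csv(
  'data/StormEvents.csv',
  na.strings = c('NA', ''),
  stringsAsFactors = FALSE)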
## Lists
x <- list('abc', 1:3, sin)
## Factors
education <- ...(
c('college', 'highschool', 'college', 'middle', 'middle'),
... = c('middle', 'highschool', ...))
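# Illustrative completion: factor() with an explicit (assumed) level ordering.
education_example <- factor(
  c('college', 'highschool', 'college', 'middle', 'middle'),
  levels = c('middle', 'highschool', 'college'))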
## Data Frames
... <- c(32000, 28000, 89000, 0, 0)
... <- data.frame(...)
## Names
...(df) <- c(...)
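# A plausible completion of the data frame and names steps; the column name
# "income" and the object names are illustrative, not given in the worksheet.
income <- c(32000, 28000, 89000, 0, 0)
df <- data.frame(education_example, income)
names(df) <- c('education', 'income')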
## Subsetting ranges
days <- c(
'Sunday', 'Monday', 'Tuesday',
'Wednesday', 'Thursday', 'Friday',
'Saturday')
weekdays <- ...
...
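# One natural completion: subset Monday through Friday by position.
weekdays_example <- days[2:6]
weekdays_example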
## Functions
function(...) {
...
return(...)
}
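# A minimal concrete function in the shape sketched above; the name and body
# are illustrative only.
add_two <- function(x) {
  result <- x + 2
  return(result)
}
add_two(3)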
## Flow Control
first <- function(dat) {
  ... {
    result <- dat[[1]]
  } ... {
    result <- dat[1, ]
  }
  return(result)
}
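# A filled-in guess at the missing condition: branch on whether the input is a
# data frame (the worksheet leaves the if/else keywords and the test blank).
first_example <- function(dat) {
  if (!is.data.frame(dat)) {
    result <- dat[[1]]
  } else {
    result <- dat[1, ]
  }
  return(result)
}
first_example(x)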
## Distributions and Statistics
...(n = 10)
x <- rnorm(n = 100, ... = 15, sd = 7)
y <- ...(n = 100, ... = 20, prob = .85)
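# Illustrative completions: the blank names are assumed to be random draws from
# the uniform, normal, and binomial distributions.
runif(n = 10)
x_example <- rnorm(n = 100, mean = 15, sd = 7)
y_example <- rbinom(n = 100, size = 20, prob = .85)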
## RegEx
library(stringr)
str_extract_all(
'Email info@sesync.org or tweet @SESYNC',
'\\b\\S+@\\S+\\b'
)
library(tm)
enron <- VCorpus(DirSource("data/enron"))
email <- enron[[1]]
match <- str_match(content(email), '^From: (.*)')
head(match)
match[[3,2]]
txt <- ...
str_match(txt, '...')
## Data Extraction
enron <- tm_map(enron, function(email) {
  body <- content(email)
  match <- str_match(body, '^From: (.*)')
  match <- na.omit(match)
  meta(email, 'author') <- match[[1, 2]]
  return(email)
})
email <- enron[[1]]
meta(email)
## Relational Data Extraction
get_to <- function(email) {
  body <- content(email)
  match <- str_detect(body, '^To:')
  if (any(match)) {
    to_start <- which(match)[[1]]
    match <- str_detect(body, '^Subject:')
    to_end <- which(match)[[1]] - 1
    to <- paste(body[to_start:to_end], collapse = '')
    to <- str_extract_all(to, '\\b\\S+@\\S+\\b')
    return(unlist(to))
  } else {
    return(NA)
  }
}
edges <- lapply(enron, FUN = function(email) {
  from <- meta(email, 'author')
  to <- get_to(email)
  return(cbind(from, to))
})
edges <- do.call(rbind, edges)
edges <- na.omit(edges)
attr(edges, 'na.action') <- NULL
library(network)
g <- network(edges)
plot(g)
## Text Mining
enron <- tm_map(enron, function(email) {
  body <- content(email)
  match <- str_detect(body, '^X-FileName:')
  begin <- which(match)[[1]] + 1
  match <- str_detect(body, '^[>\\s]*[_\\-]{2}')
  match <- c(match, TRUE)
  end <- which(match)[[1]] - 1
  content(email) <- body[begin:end]
  return(email)
})
## Cleaning Text
library(magrittr)
enron_words <- enron %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(stripWhitespace)
remove_link <- function(body) {
  match <- str_detect(body, '(http|www|mailto)')
  body[!match]
}
enron_words <- enron_words %>%
tm_map(content_transformer(remove_link))
## Stopwords and Stems
enron_words <- enron_words %>%
tm_map(stemDocument) %>%
tm_map(removeWords, stopwords("english"))
## Bag-of-Words
dtm <- DocumentTermMatrix(enron_words)
dtm
## Long Form
library(tidytext)
library(dplyr)
dtt <- tidy(dtm)
words <- dtt %>%
group_by(term) %>%
summarise(
n = n(),
total = sum(count)) %>%
mutate(nchar = nchar(term))
library(ggplot2)
ggplot(words, aes(x=nchar)) +
geom_histogram(binwidth = 1)
dtt_trimmed <- words %>%
filter(
nchar < 16,
n > 1,
total > 3) %>%
select(term) %>%
inner_join(dtt)
dtm_trimmed <- dtt_trimmed %>%
cast_dtm(document, term, count)
dtm_trimmed
## Term Correlations
word_assoc <- findAssocs(dtm_trimmed, 'ken', 0.6)
word_assoc <- data.frame(
word = names(word_assoc[[1]]),
assoc = word_assoc,
row.names = NULL)
## Latent Dirichlet allocation
library(topicmodels)
seed <- 12345
fit <- LDA(dtm_trimmed, k = 5, control = list(seed = seed))
email_topics <- as.data.frame(
posterior(fit, dtm_trimmed)$topics)
head(email_topics)
library(ggwordcloud)
topics <- tidy(fit) %>%
filter(beta > 0.004)
ggplot(topics,
aes(size = beta, label = term)) +
geom_text_wordcloud_area(rm_outside = TRUE) +
facet_wrap(vars(topic))
# Documenting and Publishing your Data Worksheet
# Preparing Data for Publication
library(tidyverse)
stm_dat <- read_csv("data/StormEvents.csv")
head(stm_dat)
str(stm_dat)
unique(stm_dat$EVENT_NARRATIVE)
dir.create('storm_project', showWarnings = FALSE)
write_csv(stm_dat, "storm_project/StormEvents_d2006.csv")
# Creating metadata
library(dataspice) ; library(here)
create_spice(dir = "storm_project")
range(stm_dat$YEAR)
range(stm_dat$BEGIN_LAT, na.rm=TRUE)
range(stm_dat$BEGIN_LON, na.rm=TRUE)
edit_biblio(metadata_dir = here("storm_project", "metadata"))
edit_creators(metadata_dir = here("storm_project", "metadata"))
prep_access(data_path = here("storm_project"),
access_path = here("storm_project", "metadata", "access.csv"))
edit_access(metadata_dir = here("storm_project", "metadata"))
prep_attributes(data_path = here("storm_project"),
attributes_path = here("storm_project", "metadata", "attributes.csv"))
edit_attributes(metadata_dir = here("storm_project", "metadata"))
write_spice(path = here("storm_project", "metadata"))
library(datapack) ; library(...) ; library(...)
json <- ...("storm_project/metadata/dataspice.json")
eml <- ...(json)
...(eml, "storm_project/metadata/dataspice.xml")
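# One plausible way to fill in the conversion above, assuming the jsonlite and
# emld packages (the worksheet leaves the package and function names blank).
library(jsonlite) ; library(emld)
json_example <- read_json("storm_project/metadata/dataspice.json")
eml_example <- as_emld(json_example)
as_xml(eml_example, "storm_project/metadata/dataspice.xml")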
# Creating a data package
library(datapack) ; library(uuid)
dp <- new("DataPackage") # create empty data package
... <- "storm_project/metadata/dataspice.xml"
... <- paste("urn:uuid:", UUIDgenerate(), sep = "")
... <- new("DataObject", id = ..., format = "eml://ecoinformatics.org/eml-2.1.1", file = ...)
dp <- ...(dp, ...) # add metadata file to data package
... <- "storm_project/StormEvents_d2006.csv"
... <- paste("urn:uuid:", UUIDgenerate(), sep = "")
... <- new("DataObject", id = ..., format = "text/csv", filename = ...)
dp <- ...(dp, ...) # add data file to data package
dp <- ...(dp, subjectID = ..., objectIDs = ...)
serializationId <- paste("resourceMap", UUIDgenerate(), sep = "")
filePath <- file.path(sprintf("%s/%s.rdf", tempdir(), serializationId))
status <- serializePackage(..., filePath, id=serializationId, resolveURI = "")
... <- serializeToBagIt(...) # right now this creates a zipped file in the tmp directory
file.copy(..., "storm_project/Storm_dp.zip") # now we have to move the file out of the tmp directory
# this is a static copy of the DataONE member nodes as of July 2019
read.csv("data/Nodes.csv")
# Getting Started
library(readr)
person <- read_csv(
file = 'data/census_pums/sample.csv',
col_types = cols_only(
... = 'i',
... = 'd',
... = 'c',
... = 'c'))
## Layered Grammar
library(...)
...(person, ...(x = WAGP)) +
..._histogram()
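# A hedged completion of the first plot: a histogram of wages (the aesthetic and
# geom choices are assumptions consistent with the rest of the worksheet).
library(ggplot2)
ggplot(person, aes(x = WAGP)) +
  geom_histogram()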
library(dplyr)
person <- filter(
person,
WAGP > 0,
WAGP < max(WAGP, na.rm = TRUE))
ggplot(person,
aes(...)) +
geom_point()
ggplot(person,
aes(x = SCHL, y = WAGP)) +
...
## Layer Customization
ggplot(person,
aes(x = SCHL, y = WAGP)) +
geom_boxplot(...) +
geom_point()
ggplot(person,
aes(x = SCHL, y = WAGP)) +
geom_boxplot() +
geom_point(
... = 'red',
... = 'summary',
fun.y = ...)
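# Illustrative completions of the two customization steps above; the specific
# arguments (hiding boxplot outliers, red mean points) are assumptions.
ggplot(person,
  aes(x = SCHL, y = WAGP)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point()
ggplot(person,
  aes(x = SCHL, y = WAGP)) +
  geom_boxplot() +
  geom_point(
    color = 'red',
    stat = 'summary',
    fun.y = 'mean')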
## Adding Aesthetics
ggplot(person,
aes(x = SCHL, y = WAGP, ...)) +
geom_boxplot()
person$SEX <- factor(person$SEX, levels = ...)
ggplot(person,
aes(x = SCHL, y = WAGP, color = SEX)) +
geom_boxplot()
# Storing and Re-plotting
... ggplot(person,
aes(x = SCHL, y = WAGP, color = SEX)) +
geom_point(
stat = 'summary',
fun.y = 'mean')
schl_wagp <- ... +
scale_color_manual(
values = c('black', 'red'))
ggsave(...,
plot = ...,
width = 4, height = 3)
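# A hedged version of the store-and-save workflow; the object and file names
# below are hypothetical.
schl_wagp_example <- ggplot(person,
  aes(x = SCHL, y = WAGP, color = SEX)) +
  geom_point(
    stat = 'summary',
    fun.y = 'mean') +
  scale_color_manual(
    values = c('black', 'red'))
ggsave('schl_wagp.pdf',
  plot = schl_wagp_example,
  width = 4, height = 3)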
# Smooth Lines
ggplot(person,
aes(x = SEX, y = WAGP)) +
geom_point() +
...(
method = ...,
aes(group = 0))
# Axes, Labels and Themes
sex_wagp <- ggplot(person,
aes(x = SEX, y = WAGP)) +
geom_point() +
geom_smooth(
method = 'lm',
aes(group = 0))
sex_wagp + ...(
... = 'Wage Gap',
x = ...,
... = 'Wages (Unadjusted USD)')
sex_wagp + ...(
trans = 'log10')
sex_wagp + ...()
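# Plausible completions for the three blanks above: labs() for the titles,
# scale_y_continuous() for the log axis, and theme_bw() for a complete theme.
# The blank x label is an assumption.
sex_wagp + labs(
  title = 'Wage Gap',
  x = NULL,
  y = 'Wages (Unadjusted USD)')
sex_wagp + scale_y_continuous(
  trans = 'log10')
sex_wagp + theme_bw()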
sex_wagp + theme_bw() +
labs(title = 'Wage Gap') +
theme(
... = element_text(
face = 'bold',
hjust = 0.5))
# Facets
person$SCHL <- factor(person$SCHL)
levels(person$SCHL) <- list(
'High School' = '16',
'Bachelor\'s' = '21',
'Master\'s' = '22',
'Doctorate' = '24')
ggplot(...,
aes(x = SEX, y = WAGP)) +
geom_point() +
geom_smooth(
method = 'lm',
aes(group = 0)) +
...
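# A filled-in guess at the faceted plot: the data argument and the final layer
# are blanks above; facet_wrap(vars(SCHL)) is one natural choice.
ggplot(person,
  aes(x = SEX, y = WAGP)) +
  geom_point() +
  geom_smooth(
    method = 'lm',
    aes(group = 0)) +
  facet_wrap(vars(SCHL))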
file.symlink(
from = '/nfs/public-data/training',
to = 'data'
)
# configure git
git config --global user.name "Zheng Liu"
git config --global user.email "liuliuzheng1208@hotmail.com"
git commit --no-edit --amend --reset-author
# Link your local repository to the origin repository on GitHub, by
# copying the code shown on your GitHub repo under the heading:
# "…or push an existing repository from the command line"
## Tidy Concept
trial <- read.delim(sep = ',', header = TRUE, text = "
block, drug, control, placebo
1, 0.22, 0.58, 0.31
2, 0.12, 0.98, 0.47
3, 0.42, 0.19, 0.40
")
## Gather
library(tidyr)
tidy_trial <- ...(trial,
key = ...,
value = ...,
...)
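# An illustrative completion of the gather() call; the key/value column names
# are assumptions, and block is kept as the identifier column.
tidy_trial_example <- gather(trial,
  key = 'treatment',
  value = 'response',
  -block)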
## Spread
survey <- read.delim(sep = ',', header = TRUE, text = "
participant, attr, val
1 , age, 24
2 , age, 57
3 , age, 13
1 , income, 30
2 , income, 60
")
tidy_survey <- ...(survey,
key = ...,
value = ...)
tidy_survey <- spread(survey,
key = attr,
value = val,
...)
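# Illustrative completions of the two spread() calls; the fill value used for
# the missing income record is an assumption.
tidy_survey_example <- spread(survey,
  key = attr,
  value = val)
tidy_survey_example <- spread(survey,
  key = attr,
  value = val,
  fill = 0)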
## Sample Data
library(data.table)
cbp <- fread('data/cbp15co.csv')
cbp <- fread(
'data/cbp15co.csv',
...,
...)
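# A hedged completion of the second fread() call: both extra arguments are
# assumptions (treat empty strings as missing, keep FIPS codes as character).
cbp_example <- fread(
  'data/cbp15co.csv',
  na.strings = '',
  colClasses = c(
    FIPSTATE = 'character',
    FIPSCTY = 'character'))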
acs <- fread(
'data/ACS/sector_ACS_15_5YR_S2413.csv',
colClasses = c(FIPS = 'character'))
## dplyr Functions
library(...)
cbp2 <- filter(...,
...,
!grepl('------', NAICS))
library(...)
cbp2 <- filter(cbp,
...)
cbp3 <- mutate(...,
...)
cbp3 <- mutate(cbp2,
FIPS = str_c(FIPSTATE, FIPSCTY),
...)
...
filter(
str_detect(NAICS, '[0-9]{2}----')
) ...
mutate(
FIPS = str_c(FIPSTATE, FIPSCTY),
NAICS = str_remove(NAICS, '-+')
)
...
...(
FIPS,
NAICS,
starts_with('N')
)
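# A hedged sketch of the pipeline the blanks above imply: keep the two-digit
# sector rows, build a county FIPS code, strip the NAICS padding, and select
# the identifier and count columns.
library(dplyr) ; library(stringr)
cbp_example <- cbp_example %>%
  filter(
    str_detect(NAICS, '[0-9]{2}----')
  ) %>%
  mutate(
    FIPS = str_c(FIPSTATE, FIPSCTY),
    NAICS = str_remove(NAICS, '-+')
  ) %>%
  select(
    FIPS,
    NAICS,
    starts_with('N')
  )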
## Join
sector <- fread(
'data/ACS/sector_naics.csv',
colClasses = c(NAICS = 'character'))
cbp <- cbp %>%
...
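# One way the join might be filled in: attach sector labels by NAICS code
# (the join verb is left blank in the worksheet; inner_join is an assumption).
cbp_example <- cbp_example %>%
  inner_join(sector)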
## Group By
cbp_grouped <- cbp %>%
...
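# Illustrative completion: group by county and sector before aggregating.
cbp_grouped_example <- cbp_example %>%
  group_by(FIPS, Sector)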
## Summarize
cbp <- cbp %>%
group_by(FIPS, Sector) %>%
...
...
acs_cbp <- ... %>%
...
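# A hedged completion of the summarize-and-join step: total the numeric count
# columns within each county and sector, then combine with the ACS table
# (assuming the ACS data shares the FIPS and Sector columns).
cbp_summary <- cbp_example %>%
  group_by(FIPS, Sector) %>%
  summarize(across(where(is.numeric), sum))
acs_cbp_example <- cbp_summary %>%
  inner_join(acs)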
# Linear models
library(readr)
library(dplyr)
library(ggplot2)
person <- read_csv(
file = 'data/census_pums/sample.csv',
col_types = cols_only(
AGEP = 'i', # Age
WAGP = 'd', # Wages or salary income past 12 months
SCHL = 'i', # Educational attainment
SEX = 'f', # Sex
OCCP = 'f', # Occupation recode based on 2010 OCC codes
WKHP = 'i')) # Usual hours worked per week past 12 months
person <- within(person, {
SCHL <- factor(SCHL)
levels(SCHL) <- list(
'Incomplete' = c(1:15),
'High School' = 16,
'College Credit' = 17:20,
'Bachelor\'s' = 21,
'Master\'s' = 22:23,
'Doctorate' = 24)}) %>%
filter(
WAGP > 0,
WAGP < max(WAGP, na.rm = TRUE))
# Formula Notation
fit <- lm(
formula = WAGP ~ SCHL,
data = person)
ggplot(person, aes(x=SCHL,y=WAGP)) + geom_boxplot()
fit
fit <- lm(
formula = log(WAGP) ~ SCHL,
data = person)
summary(fit)
fit <- lm(
formula = log(WAGP) ~ AGEP,
data = person)
summary(fit)
ggplot(person, aes(x=AGEP,y=log(WAGP))) + geom_point()
fit <- lm(
...,
person)
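# One plausible way to fill in the model formula (left blank above): wages as a
# function of several predictors at once, using columns read in earlier.
fit <- lm(
  log(WAGP) ~ SCHL + SEX + WKHP,
  person)
summary(fit)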
# Metadata matters
fit <- lm(
...,
person)
# GLM families
fit <- glm(log(WAGP)~SCHL,
family = gaussian,
data = person)
summary(fit)
# Logistic Regression
fit <- glm(SEX ~ WAGP,
family = binomial,
data = person)
summary(fit)
ggplot(person, aes(x=WAGP,y=SEX)) + geom_point()
anova(fit, update(fit, . ~ 1))
levels(person$SEX)
anova(fit, update(fit, SEX~1), test = 'Chisq')
# Random Intercept
library(lme4)
fit <- lmer(
log(WAGP) ~ (1|OCCP) + SCHL,
data = person)
summary(fit)
# Random Slope
fit <- lmer(
log(WAGP) ~ (WKHP | SCHL),
data = person)
summary(fit)
fit <- lmer(
log(WAGP) ~ (WKHP | SCHL),
data = person,
control = lmerControl(optimizer = 'bobyqa'))
ggplot(person,
aes(x = WKHP, y = log(WAGP), color = SCHL)) +
geom_point() +
geom_line(aes(y = predict(fit))) +
labs(title = 'Random intercept and slope with lmer')
library(readr)
library(dplyr)
# read in data
cty_to_cty <- readr::read_csv('data/cty-to-cty_clean.csv')
state_movers <- cty_to_cty %>% group_by(current_state) %>%
summarise(sum_new_movers = sum(movers_state_est, na.rm = TRUE)) %>%
arrange(sum_new_movers)
# pretend this takes a really long time!
Sys.sleep(10)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Requests\n",
"\n",
"This is a markdown chunk."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"response = requests.get('https://xkcd.com/869')\n"
]
},
{
"cell_type": "code",