library(XML)nba <- NULLfor (i in 1:11) { temp <- readHTMLTable( sprintf("http://espn.go.com/nba/salaries/_/page/%d",i))[[1]] nba <- rbind(nba, temp)}glimpse(nba)#> Observations: 473#> Variables: 4#> $ RK <fctr> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, RK, 11, 12, 13, 14, 15,...#> $ NAME <fctr> Stephen Curry, PG, LeBron James, SF, Paul Millsap, PF,...#> $ TEAM <fctr> Golden State Warriors, Cleveland Cavaliers, Denver Nug...#> $ SALARY <fctr> $34,382,550, $33,285,709, $31,269,231, $29,727,900, $2...
head(nba$SALARY)# get rid of $ and , in salaries and convert to numeric:gsub("[$,]", "", head(as.character(nba$SALARY)))nba$SALARY <- as.numeric(gsub("[$,]", "", as.character(nba$SALARY)))
#> [1] $34,382,550 $33,285,709 $31,269,231 $29,727,900 $29,512,900 $28,703,704#> 308 Levels: $19,332,500 $19,508,958 $19,578,455 $20,000,000 ... $895,197#> [1] "34382550" "33285709" "31269231" "29727900" "29512900" "28703704"#> Warning: NAs introduced by coercion
nba %>% filter(is.na(SALARY)) %>% head()
#> RK NAME TEAM SALARY#> 1 RK NAME TEAM NA#> 2 RK NAME TEAM NA#> 3 RK NAME TEAM NA#> 4 RK NAME TEAM NA#> 5 RK NAME TEAM NA#> 6 RK NAME TEAM NA
dim(nba)nba <- nba[-which(nba$RK=="RK"),]dim(nba)
#> [1] 473 4#> [1] 440 4
nba <- nba %>% mutate(NAME = as.character(nba$NAME)) %>% separate(NAME, c("full_name", "position"), ",") %>% separate(full_name, c("first", "last"), " ")
#> RK first last position TEAM SALARY#> 1 1 Stephen Curry PG Golden State Warriors 34382550#> 2 2 LeBron James SF Cleveland Cavaliers 33285709#> 3 3 Paul Millsap PF Denver Nuggets 31269231#> 4 4 Gordon Hayward SF Boston Celtics 29727900#> 5 5 Blake Griffin PF LA Clippers 29512900#> 6 6 Kyle Lowry PG Toronto Raptors 28703704
ggplot(data=nba, aes(x=SALARY)) + geom_histogram()
$$X = \left[ \begin{array}{rrrr}
X_1 & X_2 & ... & X_p
\end{array} \right] \\
= \left[ \begin{array}{rrrr}
X_{11} & X_{12} & ... & X_{1p} \\
X_{21} & X_{22} & ... & X_{2p} \\
\vdots & \vdots & \ddots& \vdots \\
X_{n1} & X_{n2} & ... & X_{np}
\end{array} \right]$$
\(X_1 \sim N(0,1), ~~X_2 \sim exp(1) ...\)
\(n\)
iid random variates \(\bar{X}_1=\sum_{i=1}^n X_{i1}\)
, \(s_1^2=\frac{1}{n-1}\sum_{i=1}^n(X_{i1}-\bar{X}_1)^2\)
\(n\)
. messy_data <- read_csv("../data/tb.csv")head(messy_data)#> # A tibble: 6 x 22#> iso2 year m_04 m_514 m_014 m_1524 m_2534 m_3544 m_4554 m_5564 m_65#> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>#> 1 AD 1989 NA NA NA NA NA NA NA NA NA#> 2 AD 1990 NA NA NA NA NA NA NA NA NA#> 3 AD 1991 NA NA NA NA NA NA NA NA NA#> 4 AD 1992 NA NA NA NA NA NA NA NA NA#> 5 AD 1993 NA NA NA NA NA NA NA NA NA#> 6 AD 1994 NA NA NA NA NA NA NA NA NA#> # ... with 11 more variables: m_u <int>, f_04 <int>, f_514 <int>,#> # f_014 <int>, f_1524 <int>, f_2534 <int>, f_3544 <int>, f_4554 <int>,#> # f_5564 <int>, f_65 <int>, f_u <int>
tidy_data <- messy_data %>% gather(demo, count, -year, -iso2, na.rm = TRUE) %>% separate(demo, c("gender", "age"))tidy_data <- tidy_data %>% filter(!(age %in% c("014", "04", "514", "u")))head(tidy_data)#> # A tibble: 6 x 5#> iso2 year gender age count#> <chr> <int> <chr> <chr> <int>#> 1 AD 1996 m 1524 0#> 2 AD 1997 m 1524 0#> 3 AD 1998 m 1524 0#> 4 AD 1999 m 1524 0#> 5 AD 2000 m 1524 0#> 6 AD 2002 m 1524 0
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = year, y = count, fill = gender)) + geom_bar(stat = "identity", position = "fill") + facet_grid(~ age) + scale_fill_brewer(palette="Dark2")
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = year, y = count, fill = gender)) + geom_bar(stat = "identity") + facet_grid(~ age) + scale_fill_brewer(palette="Dark2") + theme( axis.text = element_text(size = 14), strip.text = element_text(size = 16), axis.title = element_text(size = 16) )
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = year, y = count, fill = gender)) + geom_bar(stat = "identity", position="dodge") + facet_grid(~ age) + scale_fill_brewer(palette="Dark2") + theme( axis.text = element_text(size = 14), strip.text = element_text(size = 16), axis.title = element_text(size = 16) )
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = year, y = count, fill = gender)) + geom_bar(stat = "identity") + facet_grid(gender ~ age) + scale_fill_brewer(palette="Dark2") + theme( axis.text = element_text(size = 14), strip.text = element_text(size = 16), axis.title = element_text(size = 16) )
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = year, y = count, fill = gender)) + geom_bar(stat = "identity") + facet_grid(gender ~ age) + scale_fill_brewer(palette="Dark2") + theme( axis.text = element_blank(), strip.text = element_text(size = 16), axis.title = element_text(size = 16) ) + coord_polar()
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = 1, y = count, fill = factor(year))) + geom_bar(stat = "identity", position="fill") + facet_grid(gender ~ age) + theme( axis.text = element_blank(), strip.text = element_text(size = 16), axis.title = element_text(size = 16) )
tidy_data %>% filter(iso2 == "AU") %>% ggplot(aes(x = 1, y = count, fill = factor(year))) + geom_bar(stat = "identity", position="fill") + facet_grid(gender ~ age) + theme( axis.text = element_blank(), strip.text = element_text(size = 16), axis.title = element_text(size = 16) ) + coord_polar(theta="y")
What do you learn about tb incidence in the Australia by gender and age and year from the
\(H_0\)
, \(H_1\)
\(H_0\)
: no pattern, \(H_1\)
: pattern, but the choice of plot makes this much more explicit\(H_0\)
: no difference, \(H_1\)
: difference\(H_0\)
: no difference, \(H_1\)
: difference\(H_0\)
: no difference, \(H_1\)
: difference\(H_0\)
: no difference, \(H_1\)
: differenceEmbed the data plot in a field of null plots
Which plot shows the most difference between the groups?
\(p\)
-valuesSuppose \(x\)
individuals selected the data plot from a lineup of \(m\)
plots, shown to \(K\)
independent observers, then simplistically we can think about the probability of this happening, if the data plot is from the same distribution as the null plots. This yields a binomial formula:
$$P(X\geq x) = \sum_{i=x}^{K} \binom{K}{i} \left(\frac{1}{m}\right)^i\left(\frac{m-1}{m}\right)^{K-i}$$
For \(x=4, K=17, m=20\)
pvisual(4, 17, m=20)#> x simulated binom#> [1,] 4 0.021 0.008800605
Crucial idea: assign a p-value to each plot (data and null); under null hypothesis, this p-value is from U[0,1]
Scenario I:
\(k\)
th lineup evaluation do:\(p\)
-values from \(U[0,1]\)
\(q = (1-p_\text{data})/\sum_j(1-p_j)\)
\(q\)
to determine whether data was picked in simulation: \(x_k \tilde B_{1,q}\)
\(K\)
times, and find the number of data picks \(X = \sum_k x_k\)
\(X\)
Scenario II (same data, different nulls):
\(k\)
th lineup evaluation pick 20 \(p\)
-values from \(U[0,1]\)
:\(q = (1-p_\text{data})/\sum_j(1-p_j)\)
\(q\)
to determine whether data was picked in simulation: \(x_k \tilde B_{1,q}\)
\(X = \sum_k x_k\)
\(X\)
Scenario III (same data, same nulls):
\(k\)
th lineup evaluation pick \(p_\text{data} \sim U[0,1]\)
:\(p\)
-values from \(U[0,1]\)
\(q = (1-p_\text{data})/\sum_j(1-p_j)\)
\(X ~ B_{K, q}\)
\(X\)
For these plot descriptions, decide on:
Can you find the odd one out?
Is it easier now?
ggplot(fly_sub, aes(x=`In general, is itrude to bring a baby on a plane?`, fill=Gender)) + scale_fill_brewer(palette="Dark2") + geom_bar(position="fill") + coord_flip() + facet_wrap(~Age, ncol=5)
With this arrangement we can see proportion of gender across rudeness category, within age groups.
ggplot(fly_sub, aes(x=Gender, fill=`In general, is itrude to bring a baby on a plane?`)) + geom_bar(position="fill") + coord_flip() + facet_wrap(~Age, ncol=5) + scale_fill_brewer(palette="Dark2") + theme(legend.position="bottom")
Now we can see proportion of rudeness category across gender, within age groups.
ggplot(fly_sub, aes(x=Age, fill=`In general, is itrude to bring a baby on a plane?`)) + geom_bar(position="fill") + coord_flip() + facet_wrap(~Gender, ncol=5) + scale_fill_brewer(palette="Dark2") + theme(legend.position="bottom")
And, now we can see proportion of rudeness category across age groups, within gender.
This work is licensed under a Creative Commons Attribution 4.0 International License.
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |