Download RStudio

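Opening Packages
The code below uses read_excel() from the readxl package and ggplot() from the ggplot2 package; load both with library() before continuing.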
Setting the Working Directory
# How to find the folder path to use as the working directory (Windows):
# 1 Open Windows Explorer
# 2 Navigate to the C drive
# 3 Select "Users", then the folder named after your username
# 4 Select "Desktop"
# 5 Copy the address from the address bar **BE SURE TO REPLACE EVERY "\" WITH "/"
# setwd() makes that folder the working directory, so the workbook read later
# can be referred to by its file name alone.
setwd("C:/Users/alwalker/Desktop")

## REPLACE "alwalker" IN THE PATH ABOVE WITH YOUR MACHINE'S USERNAME
Importing a Data Frame and Creating a Vector
# Read the IPEDS workbook into a data frame. The file name is relative, so the
# workbook must sit in the current working directory.
# read_excel() is namespace-qualified so this line works even if readxl has
# not been attached with library().
IPEDS <- readxl::read_excel(
  "Hands-on Workshop 1-Laura Walker-Data Manipulation and Visualization in R-IPEDS Source Document.xlsx"
)

# Character vector of three hex colour codes. It is not used in the code shown
# here; presumably a palette for later plots -- confirm against the rest of
# the workshop.
manual_colors <- c("#101820", "#EAAA00", "#AF272F")
Head of a Data Frame
# Preview the first six rows (six is head()'s default row count).
head(IPEDS, n = 6)
## # A tibble: 6 x 21
##   UnitID Inst_Name        Inst_Sector    Count_Doctor_Deg Count_Masters_D~
##    <dbl> <chr>            <chr>                     <dbl>            <dbl>
## 1 161688 Allegany Colleg~ Public, 2-year               NA               NA
## 2 161767 Anne Arundel Co~ Public, 2-year               NA               NA
## 3 161864 Baltimore City ~ Public, 2-year               NA               NA
## 4 162007 Bowie State Uni~ Public, 4-yea~               10              337
## 5 405872 Carroll Communi~ Public, 2-year               NA               NA
## 6 162104 Cecil College    Public, 2-year               NA               NA
## # ... with 16 more variables: Count_Bachelors_Deg <dbl>,
## #   Count_Assoc_Deg <dbl>, Count_PostBac_PostMasters_Deg <dbl>,
## #   Count_Cert <dbl>, Count_Cert_Less_Year <dbl>,
## #   Grand_Total_All_Students <dbl>, Grand_Total_Female <dbl>,
## #   Grand_Total_Amer_Ind_AK_Native <dbl>, Grand_Total_Asian <dbl>,
## #   Grand_Total_Black_AA <dbl>, Grand_Total_Hispanic <dbl>,
## #   Grand_Total_Native_Hawaiian_PI <dbl>, Grand_Total_White <dbl>,
## #   Grand_Total_Two_More <dbl>, Grand_Total_Race_Unknown <dbl>,
## #   Grand_Total_Nonresident_Alien <dbl>
Viewing the nth Row of Data
# To view one particular row, index by row number and leave the column
# position empty. This shows the tenth row.
IPEDS[10, ]
## # A tibble: 1 x 21
##   UnitID Inst_Name          Inst_Sector  Count_Doctor_Deg Count_Masters_D~
##    <dbl> <chr>              <chr>                   <dbl>            <dbl>
## 1 162557 Frederick Communi~ Public, 2-y~               NA               NA
## # ... with 16 more variables: Count_Bachelors_Deg <dbl>,
## #   Count_Assoc_Deg <dbl>, Count_PostBac_PostMasters_Deg <dbl>,
## #   Count_Cert <dbl>, Count_Cert_Less_Year <dbl>,
## #   Grand_Total_All_Students <dbl>, Grand_Total_Female <dbl>,
## #   Grand_Total_Amer_Ind_AK_Native <dbl>, Grand_Total_Asian <dbl>,
## #   Grand_Total_Black_AA <dbl>, Grand_Total_Hispanic <dbl>,
## #   Grand_Total_Native_Hawaiian_PI <dbl>, Grand_Total_White <dbl>,
## #   Grand_Total_Two_More <dbl>, Grand_Total_Race_Unknown <dbl>,
## #   Grand_Total_Nonresident_Alien <dbl>
First Three Rows of a Dataframe
# Pass n to head() to choose how many leading rows are shown.
head(IPEDS, n = 3)
## # A tibble: 3 x 21
##   UnitID Inst_Name          Inst_Sector  Count_Doctor_Deg Count_Masters_D~
##    <dbl> <chr>              <chr>                   <dbl>            <dbl>
## 1 161688 Allegany College ~ Public, 2-y~               NA               NA
## 2 161767 Anne Arundel Comm~ Public, 2-y~               NA               NA
## 3 161864 Baltimore City Co~ Public, 2-y~               NA               NA
## # ... with 16 more variables: Count_Bachelors_Deg <dbl>,
## #   Count_Assoc_Deg <dbl>, Count_PostBac_PostMasters_Deg <dbl>,
## #   Count_Cert <dbl>, Count_Cert_Less_Year <dbl>,
## #   Grand_Total_All_Students <dbl>, Grand_Total_Female <dbl>,
## #   Grand_Total_Amer_Ind_AK_Native <dbl>, Grand_Total_Asian <dbl>,
## #   Grand_Total_Black_AA <dbl>, Grand_Total_Hispanic <dbl>,
## #   Grand_Total_Native_Hawaiian_PI <dbl>, Grand_Total_White <dbl>,
## #   Grand_Total_Two_More <dbl>, Grand_Total_Race_Unknown <dbl>,
## #   Grand_Total_Nonresident_Alien <dbl>
Last Five Rows of a Dataframe
# tail() is the counterpart of head(): it shows the last n rows.
tail(IPEDS, n = 5)
## # A tibble: 5 x 21
##   UnitID Inst_Name       Inst_Sector     Count_Doctor_Deg Count_Masters_D~
##    <dbl> <chr>           <chr>                      <dbl>            <dbl>
## 1 163286 University of ~ Public, 4-year~              628             2795
## 2 163204 University of ~ Public, 4-year~               46             3667
## 3 162210 Washington Adv~ Private not-fo~                0               86
## 4 164216 Washington Col~ Private not-fo~                0                3
## 5 164313 Wor-Wic Commun~ Public, 2-year                NA               NA
## # ... with 16 more variables: Count_Bachelors_Deg <dbl>,
## #   Count_Assoc_Deg <dbl>, Count_PostBac_PostMasters_Deg <dbl>,
## #   Count_Cert <dbl>, Count_Cert_Less_Year <dbl>,
## #   Grand_Total_All_Students <dbl>, Grand_Total_Female <dbl>,
## #   Grand_Total_Amer_Ind_AK_Native <dbl>, Grand_Total_Asian <dbl>,
## #   Grand_Total_Black_AA <dbl>, Grand_Total_Hispanic <dbl>,
## #   Grand_Total_Native_Hawaiian_PI <dbl>, Grand_Total_White <dbl>,
## #   Grand_Total_Two_More <dbl>, Grand_Total_Race_Unknown <dbl>,
## #   Grand_Total_Nonresident_Alien <dbl>
Summary Function
# Per-column summary: minimum, quartiles, median, mean, maximum and NA count
# for numeric columns; length, class and mode for character columns.
summary(IPEDS)
##      UnitID        Inst_Name         Inst_Sector        Count_Doctor_Deg
##  Min.   :161688   Length:40          Length:40          Min.   :  0.00  
##  1st Qu.:162489   Class :character   Class :character   1st Qu.:  0.00  
##  Median :163125   Mode  :character   Mode  :character   Median :  5.50  
##  Mean   :175867                                         Mean   :115.88  
##  3rd Qu.:163706                                         3rd Qu.: 73.75  
##  Max.   :434672                                         Max.   :796.00  
##                                                         NA's   :16      
##  Count_Masters_Deg Count_Bachelors_Deg Count_Assoc_Deg 
##  Min.   :   3.0    Min.   :  86.0      Min.   :   0.0  
##  1st Qu.: 122.8    1st Qu.: 365.5      1st Qu.:   0.0  
##  Median : 267.0    Median : 647.0      Median :   0.0  
##  Mean   : 746.8    Mean   :1343.0      Mean   : 402.6  
##  3rd Qu.: 665.2    3rd Qu.:1141.5      3rd Qu.: 611.5  
##  Max.   :4953.0    Max.   :6748.0      Max.   :2519.0  
##  NA's   :16        NA's   :16                          
##  Count_PostBac_PostMasters_Deg   Count_Cert     Count_Cert_Less_Year
##  Min.   :  0                   Min.   :  0.00   Min.   :  0.00      
##  1st Qu.:  0                   1st Qu.:  0.00   1st Qu.:  0.00      
##  Median : 18                   Median :  0.00   Median :  0.00      
##  Mean   : 95                   Mean   : 58.95   Mean   : 49.38      
##  3rd Qu.: 78                   3rd Qu.: 64.75   3rd Qu.: 54.75      
##  Max.   :743                   Max.   :684.00   Max.   :479.00      
##  NA's   :16                                                         
##  Grand_Total_All_Students Grand_Total_Female
##  Min.   :  451            Min.   :  197     
##  1st Qu.: 2607            1st Qu.: 1838     
##  Median : 5078            Median : 3112     
##  Mean   : 8800            Mean   : 4914     
##  3rd Qu.: 8911            3rd Qu.: 5173     
##  Max.   :50248            Max.   :24177     
##                                             
##  Grand_Total_Amer_Ind_AK_Native Grand_Total_Asian Grand_Total_Black_AA
##  Min.   :  1.00                 Min.   :   2.0    Min.   :   11       
##  1st Qu.:  7.75                 1st Qu.:  57.0    1st Qu.:  377       
##  Median : 12.50                 Median : 107.0    Median : 1088       
##  Mean   : 28.65                 Mean   : 604.5    Mean   : 2337       
##  3rd Qu.: 26.50                 3rd Qu.: 538.5    3rd Qu.: 2816       
##  Max.   :234.00                 Max.   :5156.0    Max.   :14924       
##                                                                       
##  Grand_Total_Hispanic Grand_Total_Native_Hawaiian_PI Grand_Total_White
##  Min.   :  12.0       Min.   :  0.00                 Min.   :   46    
##  1st Qu.: 128.2       1st Qu.:  2.00                 1st Qu.: 1092    
##  Median : 219.0       Median :  5.50                 Median : 2414    
##  Mean   : 701.5       Mean   : 20.75                 Mean   : 3992    
##  3rd Qu.: 691.5       3rd Qu.: 16.75                 3rd Qu.: 4658    
##  Max.   :5732.0       Max.   :356.00                 Max.   :19921    
##                                                                       
##  Grand_Total_Two_More Grand_Total_Race_Unknown
##  Min.   :   0.00      Min.   :   1.0          
##  1st Qu.:  63.25      1st Qu.:  54.0          
##  Median : 172.00      Median :  94.0          
##  Mean   : 296.75      Mean   : 347.7          
##  3rd Qu.: 441.25      3rd Qu.: 244.5          
##  Max.   :1873.00      Max.   :4617.0          
##                                               
##  Grand_Total_Nonresident_Alien
##  Min.   :   7.0               
##  1st Qu.:  34.0               
##  Median : 151.0               
##  Mean   : 470.9               
##  3rd Qu.: 331.0               
##  Max.   :4531.0               
## 
The Structure of our Dataframe
# Compact overview of the data frame: its classes, its dimensions, and each
# column's name, type and first few values.
str(IPEDS)
## Classes 'tbl_df', 'tbl' and 'data.frame':    40 obs. of  21 variables:
##  $ UnitID                        : num  161688 161767 161864 162007 405872 ...
##  $ Inst_Name                     : chr  "Allegany College of Maryland" "Anne Arundel Community College" "Baltimore City Community College" "Bowie State University" ...
##  $ Inst_Sector                   : chr  "Public, 2-year" "Public, 2-year" "Public, 2-year" "Public, 4-year or above" ...
##  $ Count_Doctor_Deg              : num  NA NA NA 10 NA NA NA NA 0 NA ...
##  $ Count_Masters_Deg             : num  NA NA NA 337 NA NA NA NA 77 NA ...
##  $ Count_Bachelors_Deg           : num  NA NA NA 832 NA NA NA NA 464 NA ...
##  $ Count_Assoc_Deg               : num  438 1717 423 0 611 ...
##  $ Count_PostBac_PostMasters_Deg : num  NA NA NA 62 NA NA NA NA 0 NA ...
##  $ Count_Cert                    : num  91 218 37 0 20 5 14 684 0 70 ...
##  $ Count_Cert_Less_Year          : num  74 408 69 0 9 60 53 100 0 116 ...
##  $ Grand_Total_All_Students      : num  3091 14689 4726 5430 3542 ...
##  $ Grand_Total_Female            : num  2010 8721 3222 3414 2159 ...
##  $ Grand_Total_Amer_Ind_AK_Native: num  6 60 7 5 9 11 26 35 1 23 ...
##  $ Grand_Total_Asian             : num  8 558 123 75 57 35 27 243 12 286 ...
##  $ Grand_Total_Black_AA          : num  353 2472 3634 4432 145 ...
##  $ Grand_Total_Hispanic          : num  45 972 134 155 139 129 92 489 62 662 ...
##  $ Grand_Total_Native_Hawaiian_PI: num  3 42 6 8 1 0 3 32 2 7 ...
##  $ Grand_Total_White             : num  2557 8821 353 199 3039 ...
##  $ Grand_Total_Two_More          : num  58 541 86 184 79 94 45 448 52 244 ...
##  $ Grand_Total_Race_Unknown      : num  27 1072 92 110 66 ...
##  $ Grand_Total_Nonresident_Alien : num  34 151 291 262 7 14 20 34 370 36 ...

What Do We Do With NAs?

# A single column is referenced as Data_frame_Name$Variable_Name

# Changing NAs to 0s for a specific column:
# is.na() flags the missing entries, and only those entries are overwritten
IPEDS$Count_Doctor_Deg[is.na(IPEDS$Count_Doctor_Deg)] <- 0

# Changing NAs to 0s for the entire dataframe
# (this covers every column, including the one already handled above)
# NOTE(review): this also targets any NAs in the character columns
# (Inst_Name, Inst_Sector) -- confirm those columns contain none
IPEDS[is.na(IPEDS)] <- 0
Adding Fields
# Derive the male head count as total students minus female students, then
# list the column names to confirm the new field was appended at the end.
IPEDS[["Grand_Total_Male"]] <- with(
  IPEDS,
  Grand_Total_All_Students - Grand_Total_Female
)
names(IPEDS)
##  [1] "UnitID"                         "Inst_Name"                     
##  [3] "Inst_Sector"                    "Count_Doctor_Deg"              
##  [5] "Count_Masters_Deg"              "Count_Bachelors_Deg"           
##  [7] "Count_Assoc_Deg"                "Count_PostBac_PostMasters_Deg" 
##  [9] "Count_Cert"                     "Count_Cert_Less_Year"          
## [11] "Grand_Total_All_Students"       "Grand_Total_Female"            
## [13] "Grand_Total_Amer_Ind_AK_Native" "Grand_Total_Asian"             
## [15] "Grand_Total_Black_AA"           "Grand_Total_Hispanic"          
## [17] "Grand_Total_Native_Hawaiian_PI" "Grand_Total_White"             
## [19] "Grand_Total_Two_More"           "Grand_Total_Race_Unknown"      
## [21] "Grand_Total_Nonresident_Alien"  "Grand_Total_Male"
Rearranging Fields By Field Names
# Reorder the columns by name so that the new Grand_Total_Male field sits
# between Grand_Total_All_Students and Grand_Total_Female.
# Indexing with a character vector returns the columns in the order listed.
IPEDS <- IPEDS[c(
  "UnitID",
  "Inst_Name",
  "Inst_Sector",
  "Count_Doctor_Deg",
  "Count_Masters_Deg",
  "Count_Bachelors_Deg",
  "Count_Assoc_Deg",
  "Count_PostBac_PostMasters_Deg",
  "Count_Cert",
  "Count_Cert_Less_Year",
  "Grand_Total_All_Students",
  "Grand_Total_Male",
  "Grand_Total_Female",
  "Grand_Total_Amer_Ind_AK_Native",
  "Grand_Total_Asian",
  "Grand_Total_Black_AA",
  "Grand_Total_Hispanic",
  "Grand_Total_Native_Hawaiian_PI",
  "Grand_Total_White",
  "Grand_Total_Two_More",
  "Grand_Total_Race_Unknown",
  "Grand_Total_Nonresident_Alien"
)]

Visualizations

Comparison–Column Chart

# Column chart: one bar per institution, bar height = total students.
# geom_col() draws bars whose heights are the values already in the data.
vis_column_chart <- ggplot(
  IPEDS,
  aes(x = Inst_Name, y = Grand_Total_All_Students)
) +
  geom_col()
vis_column_chart

Tweaking the Column Chart Part 1
# Keep only the institutions with more than 10,000 students so the chart has
# fewer bars.
IPEDS_large_pops <- subset(
  IPEDS,
  subset = Grand_Total_All_Students > 10000
)

# Same column chart as before, drawn from the filtered data.
vis_column_chart_v2 <- ggplot(
  IPEDS_large_pops,
  aes(x = Inst_Name, y = Grand_Total_All_Students)
) +
  geom_col()
vis_column_chart_v2

Tweaking the Column Chart Part 2
# coord_flip() swaps the x and y axes, turning the columns into horizontal bars
vis_column_chart_horiz <- vis_column_chart_v2 + coord_flip()
vis_column_chart_horiz

Distribution–Histogram

# Histogram of total students per institution, split into 13 bins.
vis_histogram <- ggplot(IPEDS, aes(x = Grand_Total_All_Students)) +
  geom_histogram(bins = 13)
vis_histogram

Tweaking the Histogram Part 1-Adding a Title
# Add a title to the histogram. The result is stored under a new name, so
# vis_histogram itself is left unchanged.
vis_histogram_w_title <- vis_histogram +
  labs(title = "Counts of Institutions by Total Students")
vis_histogram_w_title

Tweaking the Histogram Part 2-Centering the Title
# Centre the title: hjust = 0.5 places it at the horizontal midpoint.
vis_histogram_w_centered_title <- vis_histogram_w_title +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
vis_histogram_w_centered_title

Tweaking the Histogram Part 3-Editing the Axis Names
# Replace the default axis names with reader-friendly labels.
vis_histogram_w_centered_title_edited_axes <- vis_histogram_w_centered_title +
  labs(
    x = "Total Students",
    y = "Count"
  )
vis_histogram_w_centered_title_edited_axes

Tweaking the Histogram Part 4-Editing the Y-Axis Limits
# Fix the y-axis to run from 0 to 10 with a tick mark every 2 units.
# NOTE(review): limits set on a scale replace out-of-range values with NA by
# default, so a bin holding more than 10 institutions would be dropped from
# the plot -- confirm that no bin exceeds 10.
vis_histogram_w_edited_y_axis <- vis_histogram_w_centered_title_edited_axes +
  scale_y_continuous(
    breaks = seq(0, 10, by = 2),
    limits = c(0, 10)
  )
vis_histogram_w_edited_y_axis

Association–Two-Variable Scatter Plot

# Scatter plot of male vs. female head counts, one point per institution.
# Both axes share the same 0-30,000 range with a tick every 5,000, and the
# title is centred.
vis_scatter <- ggplot(
  IPEDS,
  aes(x = Grand_Total_Male, y = Grand_Total_Female)
) +
  geom_point() +
  labs(
    title = "Total Students by Gender",
    x = "Total Males",
    y = "Total Females"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(breaks = seq(0, 30000, by = 5000), limits = c(0, 30000)) +
  scale_y_continuous(breaks = seq(0, 30000, by = 5000), limits = c(0, 30000))
vis_scatter