library(tidyverse)
library(DT)
setwd("C:/Users/MT/Desktop/R Coding/STA9750-2024-FALL")
<- readxl::read_xlsx("2022_fare_revenue.xlsx") |>
FARES select(-`State/Parent NTD ID`,
-`Reporter Type`,
-`Reporting Module`,
-`TOS`,
-`Passenger Paid Fares`,
-`Organization Paid Fares`) |>
filter(`Expense Type` == "Funds Earned During Period") |>
select(-`Expense Type`) |>
group_by(`NTD ID`, # Sum over different `TOS` for the same `Mode`
`Agency Name`, # These are direct operated and sub-contracted
`Mode`) |> # of the same transit modality
# Not a big effect in most munis (significant DO
# tends to get rid of sub-contractors), but we'll sum
# to unify different passenger experiences
summarize(`Total Fares` = sum(`Total Fares`)) |>
ungroup()
<- readr::read_csv("2022_expenses.csv") |>
EXPENSES select(`NTD ID`,
`Agency`,
`Total`,
`Mode`) |>
mutate(`NTD ID` = as.integer(`NTD ID`)) |>
rename(Expenses = Total) |>
group_by(`NTD ID`, `Mode`) |>
summarize(Expenses = sum(Expenses)) |>
ungroup()
<- inner_join(FARES, EXPENSES, join_by(`NTD ID`, `Mode`))
FINANCIALS
<- readxl::read_xlsx("ridership.xlsx", sheet = "UPT") |>
TRIPS filter(`Mode/Type of Service Status` == "Active") |>
select(-`Legacy NTD ID`,
-`Reporter Type`,
-`Mode/Type of Service Status`,
-`UACE CD`,
-`TOS`) |>
pivot_longer(-c(`NTD ID`:`3 Mode`),
names_to="month",
values_to="UPT") |>
drop_na() |>
mutate(month=my(month)) # Parse _m_onth _y_ear date specs
<- readxl::read_xlsx("ridership.xlsx", sheet="VRM") |>
MILES filter(`Mode/Type of Service Status` == "Active") |>
select(-`Legacy NTD ID`,
-`Reporter Type`,
-`Mode/Type of Service Status`,
-`UACE CD`,
-`TOS`) |>
pivot_longer(-c(`NTD ID`:`3 Mode`),
names_to="month",
values_to="VRM") |>
drop_na() |>
group_by(`NTD ID`, `Agency`, `UZA Name`,
`Mode`, `3 Mode`, month) |>
summarize(VRM = sum(VRM)) |>
ungroup() |>
mutate(month=my(month)) # Parse _m_onth _y_ear date specs
<- inner_join(TRIPS, MILES) |>
USAGE mutate(`NTD ID` = as.integer(`NTD ID`))
sample_n(USAGE, 1000) |>
mutate(month=as.character(month)) |>
::datatable() DT
Mini-Project #01: Fiscal Characteristics of Major US Public Transit Systems
Introduction
This is a small project in which I explore data from the National Transit Database with the objective of finding facts and insights from the data. Instructions and further details provided by my professor Mr. Michael Weylandt, Ph.D can be found here. We will be using data such as farebox revenues, total number of trips, total number of vehicle miles traveled, and total revenues and expenses by source.
Preparing the Data
To start with the project, I used the following code to download the data. Here we can see some tables being created with specific columns, then these new tables are joined.
Task 1: Creating Syntatic Names
After downloading the data, the first task is to create a synthetic name. The column UZA Name
was renamed as metro_area
:
<- USAGE |>
USAGE rename(metro_area = `UZA Name`)
Task 2: Recoding the Mode column
Mode column was recoded for easier understanding of data. Details obtained from here
<- USAGE |>
USAGE mutate(Mode = case_when(
== "AR" ~ "Alaska Railroad",
Mode == "CB" ~ "Commuter Bus",
Mode == "CC" ~ "Cable Car",
Mode == "CR" ~ "Commuter Rail",
Mode == "DR" ~ "Demand Response",
Mode == "FB" ~ "Ferryboat",
Mode == "HR" ~ "Heavy Rail",
Mode == "IP" ~ "Inclined Plane",
Mode == "LR" ~ "Light Rail",
Mode == "MB" ~ "Bus",
Mode == "MG" ~ "Monorail Automated Guideway",
Mode == "PB" ~ "Publico",
Mode == "RB" ~ "Bus Rapid Transit",
Mode == "SR" ~ "Streetcar Rail",
Mode == "TB" ~ "Trolleybus",
Mode == "TR" ~ "Aerial Tramway",
Mode == "VP" ~ "Vanpool",
Mode == "YR" ~ "Hybrid Rail",
Mode TRUE ~ "Unknown"
))
Now we can see a table with cleaner data:
if(!require("DT")) install.packages("DT")
library(DT)
<- USAGE |>
USAGE select(-`3 Mode`)
sample_n(USAGE, 1000) |>
mutate(month = as.character(month)) |>
::datatable() DT
Task 3: Answering Instructor Specified Questions with dplyr
Q1. What transit agency had the most total VRM in our data set? The transit agency with the most total VRM (Vehicle Revenue Miles) is MTA New York City Transit with around 69 billion in total VRM.
|>
USAGE group_by(Agency) |>
summarise(UPT_Total = sum(UPT)) |>
arrange(desc(UPT_Total)) |>
head(1) |>
ungroup()
# A tibble: 1 × 2
Agency UPT_Total
<chr> <dbl>
1 MTA New York City Transit 69101730780
Q2. What transit mode had the most total VRM in our data set? The transit mode with the most total VRM in our data set is bus with a total UPT (Unlinked Passenger Trips) of around 49 billion
|>
USAGE group_by(Mode) |>
summarise(VRM_Total = sum(VRM)) |>
arrange(desc(VRM_Total)) |>
head(1) |>
ungroup()
# A tibble: 1 × 2
Mode VRM_Total
<chr> <dbl>
1 Bus 49444494088
# The data shows bus transportation had the most vehicle revenue miles "VRM"
# which is "The miles that vehicles are scheduled to or actually travel while in revenue service"
# https://www.transit.dot.gov/ntd/national-transit-database-ntd-glossary#V
Q3. How many trips were taken on the NYC Subway (Heavy Rail) in May 2024? A total of 186,478,364 trips were taken on the NYC subway
colnames(USAGE)
[1] "NTD ID" "Agency" "metro_area" "Mode" "month"
[6] "UPT" "VRM"
<- USAGE |>
nyc_filter_q3 filter(metro_area == "New York--Jersey City--Newark, NY--NJ", Mode == "Heavy Rail", month == "2024-05-01") |>
summarise(May_2024_Trips = sum(UPT, na.rm = TRUE))
print(nyc_filter_q3)
# A tibble: 1 × 1
May_2024_Trips
<dbl>
1 186478364
Q4. What mode of transport had the longest average trip in May 2024? Note: This question can’t be answered with the current data
Q5. How much did NYC subway ridership fall between April 2019 and April 2020? NYC subway ridership fell by approximately 91 during the dates indicated. This could be atributed to the Corona Virus pandemic of 2020.
<- USAGE |>
nyc_filter_q5_part1 filter(metro_area == "New York--Jersey City--Newark, NY--NJ", Mode == "Heavy Rail", month == "2019-04-01") |>
summarise(nyc_sub_2019 = sum(UPT, na.rm = TRUE))
<- USAGE |>
nyc_filter_q5_part2 filter(metro_area == "New York--Jersey City--Newark, NY--NJ", Mode == "Heavy Rail", month == "2020-04-01") |>
summarise(nyc_sub_2020 = sum(UPT, na.rm = TRUE))
=((nyc_filter_q5_part2-nyc_filter_q5_part1)/(nyc_filter_q5_part1))*100
nyc_filter_q5_part3print(nyc_filter_q5_part3)
nyc_sub_2020
1 -91.39649
Task 4: Explore and Analyze: Find three more interesting transit facts in this data other than those above.
Q4.1 Which Agency has the most VRM per UPT? I chose this question because I believe this shows how efficiently an agency is using its resources. A lower Vehicle Revenue Miles to Unlinked Passenger Trips ratio would show that an Agency is able to maximize the usage of its vehicles in relation to the number of passanger trips. In this case the most efficient based on the previously mentioned definition would be New York City Department of Transportation with a ratio of .00942. However, this needs deeper analysis to determine its implications and flaws
<- USAGE |>
vrm_per_upt group_by(Agency) |>
summarise(UPT_Total = sum(UPT), VRM_Total = sum(VRM), VRM_per_UPT_ratio = (VRM_Total/UPT_Total)) |>
ungroup() |>
arrange(VRM_per_UPT_ratio) |>
head(5)
print(vrm_per_upt)
# A tibble: 5 × 4
Agency UPT_Total VRM_Total VRM_per_UPT_ratio
<chr> <dbl> <dbl> <dbl>
1 New York City Department of Transportat… 440538576 4150573 0.00942
2 Plaquemines Port, Harbor and Terminal D… 1397902 37290 0.0267
3 Washington State Ferries 502394973 20230767 0.0403
4 Kansas City, City of Missouri 14171285 1074129 0.0758
5 City of Portland 56396362 4562513 0.0809
Q4.2 What is the most used mode per metro area? The most used mode per metro area is the bus
<- USAGE |>
most_used_mode_per_metro_area group_by(metro_area, Mode) |>
summarise(total_upt = sum(UPT, na.rm = TRUE), .groups = "drop") |>
group_by(metro_area) |>
slice(which.max(total_upt)) |>
select(metro_area, Mode, total_upt)
print(most_used_mode_per_metro_area)
# A tibble: 376 × 3
# Groups: metro_area [376]
metro_area Mode total_upt
<chr> <chr> <dbl>
1 Abilene, TX Bus 4811813
2 Akron, OH Bus 133397482
3 Albany, GA Bus 15828372
4 Albany--Schenectady, NY Bus 312961295
5 Albuquerque, NM Bus 200432456
6 Alexandria, LA Bus 6168051
7 Allentown--Bethlehem, PA--NJ Bus 101405562
8 Altoona, PA Bus 13053250
9 Amarillo, TX Bus 4408831
10 Ames, IA Bus 109854287
# ℹ 366 more rows
Q4.3 What are the top metro areas by mode? New York Metro area is the top area by what I consider to be the most important modes of transportation: Heavy Rail, Bus, and Commuter Rail
<- USAGE |>
top_metro_areas_by_mode group_by(Mode, metro_area) |>
summarise(total_upt = sum(UPT, na.rm = TRUE)) |>
group_by(Mode) |>
slice(which.max(total_upt)) |>
arrange(desc(total_upt))
print(top_metro_areas_by_mode)
# A tibble: 18 × 3
# Groups: Mode [18]
Mode metro_area total_upt
<chr> <chr> <dbl>
1 Heavy Rail New York--Jersey City--Newark, NY--NJ 53450544938
2 Bus New York--Jersey City--Newark, NY--NJ 23394737094
3 Commuter Rail New York--Jersey City--Newark, NY--NJ 5430746046
4 Light Rail Boston, MA--NH 1372586910
5 Trolleybus San Francisco--Oakland, CA 1345222391
6 Ferryboat New York--Jersey City--Newark, NY--NJ 556020462
7 Publico San Juan, PR 516329565
8 Bus Rapid Transit New York--Jersey City--Newark, NY--NJ 315432467
9 Commuter Bus New York--Jersey City--Newark, NY--NJ 286645521
10 Streetcar Rail Philadelphia, PA--NJ--DE--MD 263034317
11 Demand Response New York--Jersey City--Newark, NY--NJ 146574222
12 Cable Car San Francisco--Oakland, CA 138759762
13 Monorail Automated Guideway Miami--Fort Lauderdale, FL 101162479
14 Vanpool Seattle--Tacoma, WA 87386804
15 Hybrid Rail New York--Jersey City--Newark, NY--NJ 30022908
16 Aerial Tramway Portland, OR--WA 16258014
17 Inclined Plane Pittsburgh, PA 13141618
18 Alaska Railroad Anchorage, AK 2938715
Task 5: Table Summarization
In this part of the project, I created a new table from USAGE that has annual total (sum) UPT and VRM for 2022
<- USAGE |>
USAGE_2022_ANNUAL filter(year(month) == "2022") |>
group_by(`NTD ID`, Agency, metro_area, Mode) |>
summarise(UPT_Total = sum(UPT, na.rm = TRUE), VRM_Total = sum(VRM, na.rm = TRUE)) |>
ungroup()
<- FINANCIALS |>
FINANCIALS mutate(Mode = case_when(
== "AR" ~ "Alaska Railroad",
Mode == "CB" ~ "Commuter Bus",
Mode == "CC" ~ "Cable Car",
Mode == "CR" ~ "Commuter Rail",
Mode == "DR" ~ "Demand Response",
Mode == "FB" ~ "Ferryboat",
Mode == "HR" ~ "Heavy Rail",
Mode == "IP" ~ "Inclined Plane",
Mode == "LR" ~ "Light Rail",
Mode == "MB" ~ "Bus",
Mode == "MG" ~ "Monorail Automated Guideway",
Mode == "PB" ~ "Publico",
Mode == "RB" ~ "Bus Rapid Transit",
Mode == "SR" ~ "Streetcar Rail",
Mode == "TB" ~ "Trolleybus",
Mode == "TR" ~ "Aerial Tramway",
Mode == "VP" ~ "Vanpool",
Mode == "YR" ~ "Hybrid Rail",
Mode TRUE ~ "Unknown"
))
<- left_join(USAGE_2022_ANNUAL, FINANCIALS,
USAGE_AND_FINANCIALS join_by(`NTD ID`, "Mode")) |>
drop_na()
colnames(USAGE_2022_ANNUAL)
[1] "NTD ID" "Agency" "metro_area" "Mode" "UPT_Total"
[6] "VRM_Total"
sample_n(USAGE_2022_ANNUAL, 1141) |>
::datatable() DT
Task 6: Farebox Recovery Among All Systems
Note I did not restrict data to major transit systems because I was interested in seeing the results despite the size of the transit system
Q1. Which transit system (agency and mode) had the most UPT in 2022? The transit system with the most UPT in 2022 was the MTA New York City Transit
|>
USAGE_2022_ANNUAL group_by(Agency, Mode) |>
arrange(desc(UPT_Total)) |>
head(5) |>
ungroup()
# A tibble: 5 × 6
`NTD ID` Agency metro_area Mode UPT_Total VRM_Total
<int> <chr> <chr> <chr> <dbl> <dbl>
1 20008 MTA New York City Transit New York-… Heav… 1.79e9 338199451
2 20008 MTA New York City Transit New York-… Bus 4.59e8 82638609
3 90154 Los Angeles County Metropolitan… Los Angel… Bus 1.94e8 124306796
4 50066 Chicago Transit Authority Chicago, … Bus 1.40e8 44199272
5 20080 New Jersey Transit Corporation New York-… Bus 1.13e8 161834490
Q2. Which transit system (agency and mode) had the highest farebox recovery, defined as the highest ratio of Total Fares to Expenses?
The Agency Transit Authority of Central Kentucky with the mode of Vanpool had the highest farebox recovery. This indicates that this agency was able to obtain more revenue relative to its expenses in comparison to other Agencies. It its interesting to see that Vanpool occupies the second and third place in this query. This might indicate that this is Mode of transportation is financially efficient
<- USAGE_AND_FINANCIALS |>
highest_farebox_recovery group_by(Agency, Mode) |>
summarise(ratio = (sum(`Total Fares`, na.rm = TRUE)/sum(Expenses, na.rm = TRUE))) |>
ungroup() |>
arrange(desc(ratio)) |>
head(5)
print(highest_farebox_recovery)
# A tibble: 5 × 3
Agency Mode ratio
<chr> <chr> <dbl>
1 Transit Authority of Central Kentucky Vanpool 2.38
2 County of Miami-Dade Vanpool 1.67
3 Yuma County Intergovernmental Public Transportation Authority Vanpool 1.47
4 Port Imperial Ferry Corporation Ferryboat 1.43
5 Hyannis Harbor Tours, Inc. Ferryboat 1.41
Q3. Which transit system (agency and mode) has the lowest expenses per UPT?
North Carolina State University’s Bus transportation had the lowest expenses per UPT. This seems to be a transit system focusing on transportation for a university, so it might be that the low expenses derive from the relative short range of the service provided
<- USAGE_AND_FINANCIALS |>
lowest_expenses_per_UPT group_by(Agency, Mode) |>
summarise(ratio = (sum(Expenses, na.rm = TRUE)/sum(UPT_Total, na.rm = TRUE)),.groups = "drop") |>
arrange(ratio) |>
head(5)
print(lowest_expenses_per_UPT)
# A tibble: 5 × 3
Agency Mode ratio
<chr> <chr> <dbl>
1 North Carolina State University Bus 1.18
2 Anaheim Transportation Network Bus 1.28
3 Valley Metro Rail, Inc. Streetcar Rail 1.49
4 University of Iowa Bus 1.54
5 Chatham Area Transit Authority Ferryboat 1.60
Q4. Which transit system (agency and mode) has the highest total fares per UPT?
The Altoona Metro Transit Demand Response transportation had the highest total fares per UPT
<- USAGE_AND_FINANCIALS |>
highest_total_fares_per_UPT group_by(Agency, Mode) |>
summarise(ratio = (sum(`Total Fares`, na.rm = TRUE)/sum(UPT_Total, na.rm = TRUE)),.groups = "drop") |>
arrange(desc(ratio)) |>
head(5)
print(highest_total_fares_per_UPT)
# A tibble: 5 × 3
Agency Mode ratio
<chr> <chr> <dbl>
1 Altoona Metro Transit Demand Response 660.
2 Alaska Railroad Corporation Alaska Railroad 153.
3 Bay State LLC Ferryboat 65.0
4 Central Pennsylvania Transportation Authority Demand Response 50.2
5 Hampton Jitney, Inc. Commuter Bus 41.3
Q5. Which transit system (agency and mode) has the lowest expenses per VRM?
New Mexico Department of Transportation Vanpool mode had the lowest expense per VRM. That same mode is in the next 4 spots of our results. This indicates that Vanpool could be transportation mode that does not require or demands a lot of expenses.
<- USAGE_AND_FINANCIALS |>
lowest_expenses_per_VRM group_by(Agency, Mode) |>
summarise(ratio = (sum(Expenses, na.rm = TRUE)/sum(VRM_Total, na.rm = TRUE)),.groups = "drop") |>
arrange(ratio) |>
head(5)
print(lowest_expenses_per_VRM)
# A tibble: 5 × 3
Agency Mode ratio
<chr> <chr> <dbl>
1 New Mexico Department of Transportation Vanpool 0.337
2 VIA Metropolitan Transit Vanpool 0.370
3 County of Miami-Dade Vanpool 0.386
4 County of Volusia Vanpool 0.393
5 Corpus Christi Regional Transportation Authority Vanpool 0.431
Q6. Which transit system (agency and mode) has the highest total fares per VRM?
Chicago Water Taxi (Wendella) Ferryboat transportation had the highest total fares per VRM. This might reflect the fact that expenses and mainteinance for this mode of transportation is higher
<- USAGE_AND_FINANCIALS |>
highest_fares_per_VRM group_by(Agency, Mode) |>
summarise(ratio = (sum(`Total Fares`, na.rm = TRUE)/sum(VRM_Total, na.rm = TRUE)),.groups = "drop") |>
arrange(desc(ratio)) |>
head(5)
print(highest_fares_per_VRM)
# A tibble: 5 × 3
Agency Mode ratio
<chr> <chr> <dbl>
1 Chicago Water Taxi (Wendella) Ferryboat 237.
2 Altoona Metro Transit Demand Response 229.
3 Jacksonville Transportation Authority Ferryboat 158.
4 Chattanooga Area Regional Transportation Authority Inclined Plane 149.
5 Hyannis Harbor Tours, Inc. Ferryboat 138.
Conclusion
Farebox recovery rate for the Transit Authority of Central Kentucky, and more specifically the Vanpool mode seems to be great. There should be a deeper investigation into why this is, and how this transportation mode can be escalated to serve more people, or how Vanpool transportation mode’s characteristics can be leveraged for other transportation modes to achieve such recovery rates