Loading in prescription data sets and health board names from open nhs data. Loading census education data and cleaning names. Using skip function, so R only reads
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(here)
## here() starts at /Users/laurenchalmers/Library/Mobile Documents/com~apple~CloudDocs/Data-Science/B215224
july_2025_data <- read_csv("https://www.opendata.nhs.scot/dataset/84393984-14e9-4b0d-a797-b288db64d088/resource/f02161c1-15f6-4d9f-8be5-01ba613d4303/download/hb_pitc2025_07.csv") %>%
clean_names()
## Rows: 122280 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): HBT, BNFItemCode, BNFItemDescription, PrescribedType
## dbl (5): DMDCode, NumberOfPaidItems, PaidQuantity, GrossIngredientCost, Paid...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
health_board_names <- read_csv("https://www.opendata.nhs.scot/dataset/9f942fdb-e59e-44f5-b534-d6e17229cc7b/resource/652ff726-e676-4a20-abda-435b98dd7bdc/download/hb14_hb19.csv") %>%
clean_names()
## Rows: 18 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): HB, HBName, Country
## dbl (2): HBDateEnacted, HBDateArchived
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining prescription Dataset from July 2025, with health boards names. Selecting columns with health board names, the item prescribed and the quantity in which they were prescribed. Filtered using string detect to identify just antibiotic prescriptions, since they all end in ‘illin’.
Removing rows where there is no info about health board names
Health_board_data_2025 <- july_2025_data %>%
left_join(health_board_names, by = join_by(hbt == hb))
missing_values <- Health_board_data_2025%>%
filter(is.na(hb_name))
Wrangling with population data to compare with prescription health board data
population_data <- read_csv(here("data", "healthboardpopulationbyage.csv"), skip = 10) %>%
clean_names() %>%
rename(Spare = "x6",
hb_name = "health_board_area_2019",
hb_population = count) %>%
filter(age == "All people" & sex == "All people") %>%
select(hb_name, hb_population) %>%
mutate(hb_name = paste("NHS", hb_name))
## New names:
## • `` -> `...6`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3657 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Counting, Health Board Area 2019, Age, Sex
## dbl (1): Count
## lgl (1): ...6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining Population with prescription data
population_2025_health_data <- population_data %>%
left_join(Health_board_data_2025, by = join_by(hb_name == hb_name))
Grouping data by tyope of prescription anf filtering prescriptions to only see antibiotic prescriptions. Totalling up number of antibiotics for each healthboard.
health_board_2025_population_antibiotics <- population_2025_health_data %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
group_by(hb_name, bnf_item_description) %>%
summarise(paid_quantity = sum(paid_quantity))
## `summarise()` has grouped output by 'hb_name'. You can override using the
## `.groups` argument.
Joining with population data to determine number of antibiotics per person in each health board
antibiotics_2025_data <- health_board_2025_population_antibiotics %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
summarise(paid_quantity = sum(paid_quantity)) %>%
left_join(population_data) %>%
mutate(antibiotics_per_person = paid_quantity/hb_population) %>%
arrange(desc(antibiotics_per_person))
## Joining with `by = join_by(hb_name)`
library(forcats)
p2025 <- antibiotics_2025_data %>%
mutate(hb_name = fct_reorder(hb_name, antibiotics_per_person)) %>% # reorder factor by antibiotics_per_person ascending
ggplot(aes(x = hb_name, y = antibiotics_per_person, colour = hb_name, fill = hb_name)) +
geom_col() +
labs(
x = "Health Board Name",
y = "Number of Antibiotics per person in July 2025",
title = "Number of Antibiotics Prescribed per person in July 2025"
)
population <- read_csv(here("data", "healthboardpopulationbyage.csv"), skip = 10) %>%
clean_names()
## New names:
## • `` -> `...6`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3657 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Counting, Health Board Area 2019, Age, Sex
## dbl (1): Count
## lgl (1): ...6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Tidying up population data
population <- population %>%
rename(Spare = "x6",
hb_name = "health_board_area_2019",
hb_population = count) %>%
filter(age == "All people" & sex == "All people") %>%
select(hb_name, hb_population) %>%
mutate(hb_name = paste("NHS", hb_name))
Comparing with covid data
july_2020_data <- read_csv("https://www.opendata.nhs.scot/dataset/84393984-14e9-4b0d-a797-b288db64d088/resource/80b6ddb1-f09b-4d76-9927-da1e118b01ff/download/pitc202007.csv") %>%
clean_names()
## Rows: 1145451 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): HBT, BNFItemCode, BNFItemDescription, ClassOfPreparationCode
## dbl (5): GPPractice, NumberOfPaidItems, PaidQuantity, GrossIngredientCost, P...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Join with Health board names and population data
july_2020_data_health_boards <- july_2020_data %>%
left_join(health_board_names, by = join_by(hbt == hb))
july_2020_data_health_boards_population <- july_2020_data_health_boards %>%
left_join(population, by = join_by(hb_name == hb_name))
Organising data: Filtering to select only antibiotic prescriptions, totalling up these for each healthboard, and grouping by health board and type of antibiotic.
july_2020_data_health_boards_population <- july_2020_data_health_boards_population %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
group_by(hb_name, bnf_item_description) %>%
summarise(paid_quantity = sum(paid_quantity)) %>%
filter(!is.na(hb_name)) %>%
left_join(population)
## `summarise()` has grouped output by 'hb_name'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(hb_name)`
Calculating the number of antibiotics administered per person in each healthboard in july 2020
antibiotics_per_person_data_2020 <-july_2020_data_health_boards_population %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
summarise(paid_quantity = sum(paid_quantity)) %>%
left_join(population) %>%
mutate(antibiotics_per_person = paid_quantity/hb_population)
## Joining with `by = join_by(hb_name)`
PLotting this data
p2020 <- antibiotics_per_person_data_2020 %>%
mutate(hb_name = fct_reorder(hb_name, antibiotics_per_person)) %>%
ggplot(aes(x = hb_name, y = antibiotics_per_person, fill = hb_name, colour = hb_name)) +
geom_col() +
labs(x = "Health Board Name", y = "Number of Antibiotics per person in July 2020", title = "Number of Antibiotics Prescribed per person in July 2020")
comparing post and during covid with pre covid data
july_2016_data <- read_csv("https://www.opendata.nhs.scot/dataset/84393984-14e9-4b0d-a797-b288db64d088/resource/c12c905b-3a2d-4b7a-b5d2-8c359b786beb/download/pitc201607.csv") %>%
clean_names()
## Rows: 1131982 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): HBT2014, BNFItemCode, BNFItemDescription
## dbl (5): GPPractice, NumberOfPaidItems, PaidQuantity, GrossIngredientCost, P...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
joining with health board and population data
july_2016_data_health_boards <- july_2016_data %>%
left_join(health_board_names, by = join_by(hbt2014 == hb))
july_2016_data_health_boards_population <- july_2016_data_health_boards %>%
left_join(population, by = join_by(hb_name == hb_name))
Organising the data
july_2016_data_health_boards_population <- july_2016_data_health_boards_population %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
group_by(hb_name, bnf_item_description) %>%
summarise(paid_quantity = sum(paid_quantity)) %>%
filter(!is.na(hb_name)) %>%
left_join(population)
## `summarise()` has grouped output by 'hb_name'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(hb_name)`
calculating the numbers of ab per person in each health board in 2016
antibiotics_per_person_data_2016 <-july_2016_data_health_boards_population %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
summarise(paid_quantity = sum(paid_quantity)) %>%
left_join(population) %>%
mutate(antibiotics_per_person = paid_quantity/hb_population)
## Joining with `by = join_by(hb_name)`
Plotting this data
p2016 <- antibiotics_per_person_data_2016 %>%
mutate(hb_name = fct_reorder(hb_name, antibiotics_per_person)) %>%
ggplot(aes(x = hb_name, y = antibiotics_per_person, fill = hb_name, colour = hb_name)) +
geom_col() +
labs(x = "Health Board Name", y = "Number of Antibiotics per person in July 2016", title = "Number of Antibiotics Prescribed per person in July 2016")
Joining all years july data for comparison
Joined_prescription_data <- antibiotics_per_person_data_2016 %>%
left_join(antibiotics_per_person_data_2020)
## Joining with `by = join_by(hb_name, paid_quantity, hb_population,
## antibiotics_per_person)`
Joined_prescription_data_all <- Joined_prescription_data %>%
left_join(antibiotics_2025_data)
## Joining with `by = join_by(hb_name, paid_quantity, hb_population,
## antibiotics_per_person)`
#is there a way to join this so values are not averaged, maybe ill try plotting and facet wrapping to see what this does
Plotting all data
Joined_prescription_data_all %>%
ggplot(aes(x = hb_name, y = antibiotics_per_person), fill = hb_name) +
geom_col() +
labs(x = "Health Board Name", y = "Number of Antibiotics per person", title = "Number of Antibiotics Prescribed per person in July 2016, July 2020, July 2025")
## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic argument:
## • fill = hb_name
## ℹ Did you misspell an argument name?
Investigating most common types of ab prescribed in each hb across pre, during and post covid
july_2016_ab_type <- july_2016_data %>%
left_join(health_board_names, by = join_by(hbt2014 == hb)) %>%
select(paid_quantity, hb_name, bnf_item_description, paid_date_month) %>%
left_join(population, by = join_by(hb_name == hb_name))
Getting rid of dose data, so just shows different types of ab
library(dplyr)
library(stringr)
july_2016_ab_type <- july_2016_ab_type %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
mutate(bnf_item_description = case_when(str_starts(bnf_item_description, "AMOX") ~ "AMOXICILLIN", str_starts(bnf_item_description, "BENZYL") ~ "BENZYLPENICILLN", str_starts(bnf_item_description, "PHENO") ~ "PHENOXYMETHYLPENICILLIN" , str_starts(bnf_item_description, "PIV") ~ "PIVMECILLINAM", str_starts(bnf_item_description, "AMPI") ~ "AMPICILLIN", TRUE ~ bnf_item_description)) %>%
group_by(bnf_item_description, hb_name, paid_date_month) %>%
summarise(paid_quantity = sum(paid_quantity), .groups = "drop")
july_2016_ab_type_top <- july_2016_ab_type %>%
arrange(desc(paid_quantity))
type data, doesnt take into account amount per person though, but probs not that relevant here
july_2016_ab_type_top %>%
ggplot(aes(x = bnf_item_description, y = paid_quantity, fill = bnf_item_description, colour = bnf_item_description)) +
geom_col() +
facet_wrap(~hb_name) +
labs(x = "Type of Antibiotic", y = "Number of Antibiotics Prescribed", title = "Most common Antibiotics Prescribed in July 2016")
Type data for 2020
july_2020_ab_type <- july_2020_data %>%
left_join(health_board_names, by = join_by(hbt == hb)) %>%
select(paid_quantity, hb_name, bnf_item_description, paid_date_month) %>%
left_join(population, by = join_by(hb_name == hb_name))
july_2020_ab_type <- july_2020_ab_type %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>% mutate(bnf_item_description = case_when(str_starts(bnf_item_description, "AMOX") ~ "AMOXICILLIN", str_starts(bnf_item_description, "BENZYL") ~ "BENZYLPENICILLN", str_starts(bnf_item_description, "PHENO") ~ "PHENOXYMETHYLPENICILLIN" , str_starts(bnf_item_description, "PIV") ~ "PIVMECILLINAM", str_starts(bnf_item_description, "AMPI") ~ "AMPICILLIN", str_starts(bnf_item_description, "PIP") ~ "PIPERACILLIN", TRUE ~ bnf_item_description)) %>%
group_by(bnf_item_description, hb_name, paid_date_month) %>%
filter(!is.na(hb_name)) %>%
summarise(paid_quantity = sum(paid_quantity))
## `summarise()` has grouped output by 'bnf_item_description', 'hb_name'. You can
## override using the `.groups` argument.
july_2020_ab_type_top_10 <- july_2020_ab_type %>%
arrange(desc(paid_quantity)) %>%
slice_max(paid_quantity, n = 10)
type data, doesnt take into account amount per person though, but probs not that relevant here
july_2020_ab_type_top_10 %>%
ggplot(aes(x = bnf_item_description, y = paid_quantity, fill = bnf_item_description, colour = bnf_item_description)) +
geom_col() +
facet_wrap(~hb_name) +
labs(x = "Type of Antibiotic", y = "Number of Antibiotics Prescribed", title = "Most common Antibiotics Prescribed in July 2020")
Type data for 2025
july_2025_ab_type <- july_2025_data %>%
left_join(health_board_names, by = join_by(hbt == hb)) %>%
select(paid_quantity, hb_name, bnf_item_description, paid_date_month) %>%
left_join(population, by = join_by(hb_name == hb_name))
july_2025_ab_type <- july_2025_ab_type %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>% mutate(bnf_item_description = case_when(str_starts(bnf_item_description, "AMOX") ~ "AMOXICILLIN", str_starts(bnf_item_description, "BENZYL") ~ "BENZYLPENICILLN", str_starts(bnf_item_description, "PHENO") ~ "PHENOXYMETHYLPENICILLIN" , str_starts(bnf_item_description, "PIV") ~ "PIVMECILLINAM", str_starts(bnf_item_description, "AMPI") ~ "AMPICILLIN", str_starts(bnf_item_description, "FLU") ~ "FLUCLOXACILLIN", TRUE ~ bnf_item_description)) %>%
group_by(bnf_item_description, hb_name, paid_date_month) %>%
filter(!is.na(hb_name)) %>%
summarise(paid_quantity = sum(paid_quantity))
## `summarise()` has grouped output by 'bnf_item_description', 'hb_name'. You can
## override using the `.groups` argument.
july_2025_ab_type_top_10 <- july_2025_ab_type %>%
arrange(desc(paid_quantity)) %>%
slice_max(paid_quantity, n = 10)
type data, doesnt take into account amount per person though, but probs not that relevant here
july_2025_ab_type_top_10 %>%
ggplot(aes(x = bnf_item_description, y = paid_quantity, fill = bnf_item_description)) +
geom_col() +
facet_wrap(~hb_name) +
labs(x = "Type of Antibiotic", y = "Number of Antibiotics Prescribed", title = "Most common Antibiotics Prescribed in July 2025")
Create a tibble of all data, and compare year against ab per person ?
All_years <- july_2020_data %>%
left_join(july_2016_data, by = join_by( hbt== hbt2014, bnf_item_description == bnf_item_description, paid_quantity == paid_quantity, paid_date_month == paid_date_month)) %>%
left_join(july_2025_data)
## Joining with `by = join_by(hbt, bnf_item_description, paid_quantity,
## paid_date_month)`
All_years_hb <- All_years %>%
left_join(health_board_names, by = join_by(hbt == hb))
All_years_filtered <- All_years_hb %>%
select(bnf_item_description, hb_name, paid_quantity, paid_date_month) %>%
filter(str_detect(bnf_item_description, "CILLIN")) %>%
group_by(hb_name, bnf_item_description, paid_date_month) %>%
summarise(paid_quantity = sum(paid_quantity)) %>%
arrange(desc(paid_quantity)) %>%
filter(!is.na (hb_name))
## `summarise()` has grouped output by 'hb_name', 'bnf_item_description'. You can
## override using the `.groups` argument.
All_years_filtered %>%
ggplot(aes(x = bnf_item_description, y = paid_quantity, fill = hb_name)) +
geom_col() +
facet_wrap(~paid_date_month)
Table containing data about antibiotic type, total number, number per person, across the 3 years and different healthboards need to ask whether i need to use old census data for 2016 and 2020, also need to ask if i should look at more months or just these months