
I have the following data frame:

eu_df <- structure( list( nuts_code = c( "PT17", "PT17", "PT17", "PT17", "PT17", "PT17", "PT17", "PT17", "PT17", "PT17", "PT17", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1A", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B", "PT1B" ), year = c( 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 ), pop = c( 2815667, 2820766, 2829408, 2836906, 2849085, 2859422, 2884800, 2884695, 2883645, 2921564, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2097742, 2126578, NA, NA, NA, NA, NA, NA, NA, NA, NA, 823822, 834599 ), medage = c( 41.9, 42.1, 42.4, 42.7, 42.9, 43.2, 43.5, 44, 44.5, 44.8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 44.5, 44.7, NA, NA, NA, NA, NA, NA, NA, NA, NA, 45.5, 45.6 ), gdp = c( NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 77253.6, 90226.08, 102494.09, NA, NA, NA, NA, NA, NA, NA, NA, 14134.32, 15740.99, 17196, NA ), selfemp = c( NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 132.2, 143.35, NA, NA, NA, NA, NA, NA, NA, NA, NA, 43.5, 45.96, NA, NA ), entnum = c( NA, NA, NA, NA, NA, NA, NA, 387952, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 305387, 333104, NA, NA, NA, NA, NA, NA, NA, NA, NA, 82565, 90899, NA, NA ), area = c( 3015, 3015, 3015, 3015, 3015, 3015, 3015, 3015, 3015, 3015, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1390, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1625 ) ), row.names = c(NA, -33L), class = "data.frame" )
> eu_df

   nuts_code year     pop medage       gdp selfemp entnum area
1       PT17 2014 2815667   41.9        NA      NA     NA 3015
2       PT17 2015 2820766   42.1        NA      NA     NA 3015
3       PT17 2016 2829408   42.4        NA      NA     NA 3015
4       PT17 2017 2836906   42.7        NA      NA     NA 3015
5       PT17 2018 2849085   42.9        NA      NA     NA 3015
6       PT17 2019 2859422   43.2        NA      NA     NA 3015
7       PT17 2020 2884800   43.5        NA      NA     NA 3015
8       PT17 2021 2884695   44.0        NA      NA 387952 3015
9       PT17 2022 2883645   44.5        NA      NA     NA 3015
10      PT17 2023 2921564   44.8        NA      NA     NA 3015
11      PT17 2024      NA     NA        NA      NA     NA   NA
12      PT1A 2014      NA     NA        NA      NA     NA   NA
13      PT1A 2015      NA     NA        NA      NA     NA   NA
14      PT1A 2016      NA     NA        NA      NA     NA   NA
15      PT1A 2017      NA     NA        NA      NA     NA   NA
16      PT1A 2018      NA     NA        NA      NA     NA   NA
17      PT1A 2019      NA     NA        NA      NA     NA   NA
18      PT1A 2020      NA     NA        NA      NA     NA   NA
19      PT1A 2021      NA     NA  77253.60  132.20 305387   NA
20      PT1A 2022      NA     NA  90226.08  143.35 333104   NA
21      PT1A 2023 2097742   44.5 102494.09      NA     NA   NA
22      PT1A 2024 2126578   44.7        NA      NA     NA 1390
23      PT1B 2014      NA     NA        NA      NA     NA   NA
24      PT1B 2015      NA     NA        NA      NA     NA   NA
25      PT1B 2016      NA     NA        NA      NA     NA   NA
26      PT1B 2017      NA     NA        NA      NA     NA   NA
27      PT1B 2018      NA     NA        NA      NA     NA   NA
28      PT1B 2019      NA     NA        NA      NA     NA   NA
29      PT1B 2020      NA     NA        NA      NA     NA   NA
30      PT1B 2021      NA     NA  14134.32   43.50  82565   NA
31      PT1B 2022      NA     NA  15740.99   45.96  90899   NA
32      PT1B 2023  823822   45.5  17196.00      NA     NA   NA
33      PT1B 2024  834599   45.6        NA      NA     NA 1625

nuts_code is the code of a European region, year is the year in which the variable was observed, pop is the population, medage is the median age of the population, gdp is the GDP, selfemp is the number of self-employed persons, entnum is the number of enterprises founded in that year, and area is the area of the region.

Basically, under the new NUTS classification (the classification of European regions), the region PT17 was recently split into two regions: PT1A and PT1B.

This means that, for example, while the population (pop) for PT17 is missing in 2024, it can be recovered as the sum of the populations for PT1A and PT1B in the same year.
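A quick check with the data above:

with(eu_df, sum(pop[nuts_code %in% c("PT1A", "PT1B") & year == 2024]))
# 2126578 + 834599 = 2961177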

Same thing for the other variables and the other years.

So the exercise is, when a variable for PT17 is missing in a given year, compute it as the sum of the same variable for PT1A and PT1B in that same year.

As an additional complexity, for medage we need to compute the median, not the sum.
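For instance, with the 2024 values from above:

median(c(44.7, 45.6))  # PT1A and PT1B medage in 2024 -> 45.15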

I need to build a general function because I have plenty of these cases.

I came up with the following:

library(dplyr)
library(magrittr)  # for %<>%

# Fold the new NUTS codes back into the old one (a split region whose
# values we want to reconstruct under the pre-split code)
change_nuts <- function(df, old_nuts_code, new_nuts_codes, fns) {
  old_df <- df %>% filter(nuts_code == old_nuts_code)
  stopifnot(nrow(old_df) > 0)
  vars <- colnames(df) %>% setdiff(c("nuts_code", "year"))
  for (var in vars) {
    for (year in sort(unique(old_df$year))) {
      # Keep existing values; only fill in the missing ones
      if (!is.na(old_df[old_df$year == year, var])) {
        next
      }
      # Start from the first new region's value ...
      old_df[old_df$year == year, var] <- df[
        df$nuts_code == new_nuts_codes[1] & df$year == year,
        var
      ]
      # ... then fold in the remaining new regions one by one
      for (new_nuts_code in new_nuts_codes[-1]) {
        if (var %in% names(fns)) {
          # Combine both values into one vector before applying the
          # aggregator: do.call(median, list(a, b)) would silently treat
          # b as median()'s na.rm argument and just return a
          old_df[old_df$year == year, var] <- do.call(
            fns[[var]],
            list(c(
              old_df[old_df$year == year, var],
              df[df$nuts_code == new_nuts_code & df$year == year, var]
            ))
          )
        } else {
          old_df[old_df$year == year, var] <- sum(
            old_df[old_df$year == year, var],
            df[df$nuts_code == new_nuts_code & df$year == year, var]
          )
        }
      }
    }
  }
  # Drop both the old and the new NUTS codes, then add back the
  # reconstructed rows under the old code, sorted by code and year
  df %<>% filter(nuts_code != old_nuts_code)
  df %<>% filter(!(nuts_code %in% new_nuts_codes))
  df %<>% bind_rows(old_df)
  df %<>% arrange(nuts_code, year)
  return(df)
}

which seems to work:

fns <- list("medage" = median)
eu_df %<>% change_nuts("PT17", c("PT1A", "PT1B"), fns = fns)

and then, after

eu_df %>% filter(nuts_code %in% c("PT17", "PT1A", "PT1B"))

I get the expected result:

   nuts_code year     pop medage       gdp selfemp entnum area
1       PT17 2014 2815667  41.90        NA      NA     NA 3015
2       PT17 2015 2820766  42.10        NA      NA     NA 3015
3       PT17 2016 2829408  42.40        NA      NA     NA 3015
4       PT17 2017 2836906  42.70        NA      NA     NA 3015
5       PT17 2018 2849085  42.90        NA      NA     NA 3015
6       PT17 2019 2859422  43.20        NA      NA     NA 3015
7       PT17 2020 2884800  43.50        NA      NA     NA 3015
8       PT17 2021 2884695  44.00  91387.92  175.70 387952 3015
9       PT17 2022 2883645  44.50 105967.07  189.31 424003 3015
10      PT17 2023 2921564  44.80 119690.09      NA     NA 3015
11      PT17 2024 2961177  45.15        NA      NA     NA 3015

With the missing variables for PT17 computed using the new regions.

The problem is that my function seems, well, ugly.

Is there a way to do this with tidyverse?

Comments
  • Out of curiosity, how would you interpret a median of age medians of 2 differently sized groups? PT1A_2024 = (N: 2.1M; Mdn: 44.7) and PT1B_2024 = (N: 0.8M; Mdn: 45.6).
  • @margusl I don't know exactly, but that seems the best I can do?
  • @robertspierre, I genuinely think it's an interesting problem, but it can't be too original. When I asked, I was thinking more about extreme cases of highly skewed age distributions in groups where population size can vary even more than in your case, and perhaps about approaching this through population-weighted mean ages, which would still need a remark when used for a plot or report.
  • @margusl Yeah, but that R package is bugged for me, so I have to do it manually. The NUTS classification is a real mess. And Eurostat even provides different data under different NUTS classifications.

4 Answers


If you can only do the sum, the following should suffice:

# needs dplyr and tidyr attached, plus reshape2 installed for recast()
eu_df %>%
   reshape2::recast(variable + year ~ nuts_code, id.var = 1:2) %>%
   mutate(PT17 = coalesce(PT17, PT1A + PT1B), .keep = 'unused') %>%
   pivot_wider(names_from = variable, values_from = PT17)

# A tibble: 11 × 7
    year     pop medage     gdp selfemp entnum  area
   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>  <dbl> <dbl>
 1  2014 2815667   41.9     NA      NA      NA  3015
 2  2015 2820766   42.1     NA      NA      NA  3015
 3  2016 2829408   42.4     NA      NA      NA  3015
 4  2017 2836906   42.7     NA      NA      NA  3015
 5  2018 2849085   42.9     NA      NA      NA  3015
 6  2019 2859422   43.2     NA      NA      NA  3015
 7  2020 2884800   43.5     NA      NA      NA  3015
 8  2021 2884695   44    91388.    176. 387952  3015
 9  2022 2883645   44.5 105967.    189. 424003  3015
10  2023 2921564   44.8 119690.     NA      NA  3015
11  2024 2961177   90.3     NA      NA      NA  3015

Summing everything means the 2024 medage above comes out as 44.7 + 45.6 = 90.3, which is obviously not a median. To compute the median, note that the median of two numbers is just their mean. Thus we could do:

eu_df %>%
   reshape2::recast(variable+year~nuts_code, id.var = 1:2) %>%
   mutate(PT17_1 = PT1A +  PT1B, 
          PT17_2 = ifelse(variable == 'medage', PT17_1/2, PT17_1),
          PT17 = coalesce(PT17, PT17_2)) %>%
   pivot_wider(names_from = variable, values_from = PT17, id_cols = year)
# A tibble: 11 × 7
    year     pop medage     gdp selfemp entnum  area
   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>  <dbl> <dbl>
 1  2014 2815667   41.9     NA      NA      NA  3015
 2  2015 2820766   42.1     NA      NA      NA  3015
 3  2016 2829408   42.4     NA      NA      NA  3015
 4  2017 2836906   42.7     NA      NA      NA  3015
 5  2018 2849085   42.9     NA      NA      NA  3015
 6  2019 2859422   43.2     NA      NA      NA  3015
 7  2020 2884800   43.5     NA      NA      NA  3015
 8  2021 2884695   44    91388.    176. 387952  3015
 9  2022 2883645   44.5 105967.    189. 424003  3015
10  2023 2921564   44.8 119690.     NA      NA  3015
11  2024 2961177   45.2     NA      NA      NA  3015

4 Comments

When using {dplyr} it might be better to use dplyr::if_else() instead of base::ifelse(). (+1)
Thank you very much! Out of curiosity, can we replace recast with tidyverse pivot_longer?
We can replace recast with pivot_longer + pivot_wider.
robertspierre, that recast call is actually doing a wider/longer double pivot; I'm no expert in recast, but I think an approximation is select(eu_df, nuts_code, year, pop, area) |> pivot_wider(id_cols = year, names_from = nuts_code, values_from = c(pop, area)) |> pivot_longer(cols = -year, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value"))
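Following up on that comment thread, a self-contained sketch of the tidyr-only replacement for the recast() step (it names the long column "variable" so the rest of the pipeline above works unchanged):

library(tidyr)

# Equivalent of reshape2::recast(variable + year ~ nuts_code, id.var = 1:2):
# melt everything except the two id columns, then spread the region codes
eu_df %>%
  pivot_longer(-c(nuts_code, year), names_to = "variable") %>%
  pivot_wider(names_from = nuts_code, values_from = value)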

I think this can be done by summarizing over "PT1A" and "PT1B", then joining the result back onto eu_df and coalescing.

filter(eu_df, nuts_code %in% c("PT1A", "PT1B")) |>
  summarize(
    .by = year,
    across(-c(nuts_code, medage), ~ sum(.x)),
    medage = median(medage)
  ) |>
  right_join(
    filter(eu_df, nuts_code %in% "PT17"),
    join_by(year), suffix = c(".y", "")) |>
  mutate(
    across(ends_with(".y"), ~ coalesce(pull(pick(sub("\\.y$", "", cur_column()))), .x),
           .names = "{sub('\\\\.y$', '', .col)}")
  ) |>
  select(-ends_with(".y"))
#    year nuts_code     pop medage       gdp selfemp entnum area
# 1  2014      PT17 2815667  41.90        NA      NA     NA 3015
# 2  2015      PT17 2820766  42.10        NA      NA     NA 3015
# 3  2016      PT17 2829408  42.40        NA      NA     NA 3015
# 4  2017      PT17 2836906  42.70        NA      NA     NA 3015
# 5  2018      PT17 2849085  42.90        NA      NA     NA 3015
# 6  2019      PT17 2859422  43.20        NA      NA     NA 3015
# 7  2020      PT17 2884800  43.50        NA      NA     NA 3015
# 8  2021      PT17 2884695  44.00  91387.92  175.70 387952 3015
# 9  2022      PT17 2883645  44.50 105967.07  189.31 424003 3015
# 10 2023      PT17 2921564  44.80 119690.09      NA     NA 3015
# 11 2024      PT17 2961177  45.15        NA      NA     NA 3015
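That across() call is dense: for every summarized column (suffixed .y) it writes coalesce(existing value, child aggregate) back into the plain column name. For a single column it is roughly equivalent to the following, where joined is a hypothetical name for the result of the right_join() above:

joined |>
  mutate(gdp = coalesce(gdp, gdp.y)) |>  # keep PT17's gdp, else the child sum
  select(-gdp.y)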

I chose to use pop:area as the column selector in an earlier revision, which relies wholly on the order of your data; excluding the known columns by name, as above, is slightly safer and more general (at a small cost of readability). Note also that the suffix = c(".y", "") in the join matters: the summarized child columns must be the ones that get the .y suffix, so that coalesce() keeps PT17's existing values and only fills in the missing ones. Flipping the suffix order would silently prefer the child aggregates and overwrite existing values, which is exactly the medage-2023 issue raised in the comments below.

(Edited to change medage from sum to median. Edited again to fix the order of joining, remedying the medage for 2023.)

The pattern here is to apply whatever aggregating function each column needs in the summarize(..) step. I originally had sum for all of them; after fixing the "median" issue we now calculate median(medage) and sum(.) on all the others. If you have other columns that need another aggregating function (min, max, mean, etc.), just modify the summarize(..) step to do what you need. (For example, I suspect area should not be summed ...)

Comments

"As an additional complexity, for medage we need to compute the median, not the sum." Also, your medage for 2023 should be 44.8. I do not think the OP wants to overwrite the existing values, but rather just recover the missing ones, i.e. "when a variable for PT17 is missing in a given year, compute it as the sum of the same variable for PT1A and PT1B in that same year."
Yes, just replace the missing values and leave the existing ones where they are. Isn't that what coalesce does?
Yes, medage is wrong
That was well above my pay grade. Thank you so much!
it's not that hard, really, just augment the across(-nuts_code) so that your medage uses median? fixed
@Onyambu "do not think OP wants to overwrite the existing values", using coalesce(.) it should not be overwriting anything that has an existing value. Are you seeing something different happen here?
Your code replaces the existing values. eg medage 2023 should remain 44.8

We assume that the change_nuts function signature should be the same as in the question and that the key columns (nuts_code, year) have those names. There may be any number of other columns, and they may have any names.

Replace each occurrence of the new_nuts_codes in the nuts_code column with old_nuts_code, then summarize, aggregating each column with the function found in fns, or with sum_ if no function is given for that column. The functions sum_ and median_ from hablar are like sum and median, except that they default to na.rm = TRUE and return NA if all values are NA. hablar has _ versions of common R functions, and new ones can be created using the hablar s function.
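To make that concrete, a base-R sketch of the behaviour (sum_na here is a hypothetical stand-in for hablar::sum_, shown only to illustrate the semantics):

sum_na <- function(x) if (all(is.na(x))) NA else sum(x, na.rm = TRUE)

sum_na(c(1, NA, 2))  # 3  -- NAs are ignored
sum_na(c(NA, NA))    # NA -- all-missing stays missing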


library(dplyr)
library(hablar)

change_nuts <- function(df, old_nuts_code, new_nuts_codes, fns) {
  
  df %>%
    mutate(nuts_code = replace(x = nuts_code, 
      list = nuts_code %in% new_nuts_codes, 
      values = old_nuts_code)) %>%
    summarize(across(everything(), ~ (fns[[cur_column()]] %||% sum_)(.x)), 
      .by = c(nuts_code, year))
}
   
# test
fns <- list("medage" = median_)
eu_df %>% change_nuts("PT17", c("PT1A", "PT1B"), fns = fns)

##    nuts_code year     pop medage       gdp selfemp entnum area
## 1       PT17 2014 2815667  41.90        NA      NA     NA 3015
## 2       PT17 2015 2820766  42.10        NA      NA     NA 3015
## 3       PT17 2016 2829408  42.40        NA      NA     NA 3015
## 4       PT17 2017 2836906  42.70        NA      NA     NA 3015
## 5       PT17 2018 2849085  42.90        NA      NA     NA 3015
## 6       PT17 2019 2859422  43.20        NA      NA     NA 3015
## 7       PT17 2020 2884800  43.50        NA      NA     NA 3015
## 8       PT17 2021 2884695  44.00  91387.92  175.70 775904 3015
## 9       PT17 2022 2883645  44.50 105967.07  189.31 424003 3015
## 10      PT17 2023 5843128  44.80 119690.09      NA     NA 3015
## 11      PT17 2024 2961177  45.15        NA      NA     NA 3015
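One thing to note: because this recodes PT1A/PT1B to PT17 and then aggregates all rows per (nuts_code, year), years in which both the old and the new codes carry data get combined, which is why the 2021 entnum (775904) and the 2023 pop (5843128) above differ from the question's expected output; the join-and-coalesce answers keep existing PT17 values instead. As for the fns[[cur_column()]] %||% sum_ idiom: %||% returns its left-hand side unless that is NULL (it is exported by rlang, and is in base R from 4.4.0), so it picks the per-column aggregator with sum_ as the fallback:

NULL    %||% sum_  # sum_    (no per-column function given)
median_ %||% sum_  # median_ (per-column function wins)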

1 Comment

Thank you, I didn't know hablar. I'll definitely check it out!

Since you mention you "have plenty of these cases," one way you could scale this up would be to make a table that explicitly maps nuts_code "children" to their "parent." For instance,

nuts_map <- tibble::tribble(~parent, ~child,
                            "PT17", "PT1A",
                            "PT17", "PT1B")

Then we can aggregate the child data and use it to fill in missing values in the parent data:

child_data <- eu_df |>
  inner_join(nuts_map, join_by(nuts_code == child)) |>
  summarize(across(c(pop, gdp:area), sum), 
            across(medage, median),
            .by = c(year, parent))
eu_df |>
  anti_join(nuts_map, join_by(nuts_code == child)) |>
  rows_patch(child_data |> rename(nuts_code = parent))

Result:

   nuts_code year     pop medage       gdp selfemp entnum area
1       PT17 2014 2815667  41.90        NA      NA     NA 3015
2       PT17 2015 2820766  42.10        NA      NA     NA 3015
3       PT17 2016 2829408  42.40        NA      NA     NA 3015
4       PT17 2017 2836906  42.70        NA      NA     NA 3015
5       PT17 2018 2849085  42.90        NA      NA     NA 3015
6       PT17 2019 2859422  43.20        NA      NA     NA 3015
7       PT17 2020 2884800  43.50        NA      NA     NA 3015
8       PT17 2021 2884695  44.00  91387.92  175.70 387952 3015
9       PT17 2022 2883645  44.50 105967.07  189.31 424003 3015
10      PT17 2023 2921564  44.80 119690.09      NA     NA 3015
11      PT17 2024 2961177  45.15        NA      NA     NA 3015
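With several splits, scaling up is mostly a matter of adding rows to nuts_map. One caveat: rows_patch() defaults to using the first column of the patch table as its key, which happens to work above because year alone identifies the rows; with more than one parent you would want to pass the key explicitly. A sketch (FRXX, FRX1 and FRX2 are hypothetical codes, purely for illustration):

# Hypothetical second split (FRXX -> FRX1 + FRX2), to sketch the scaling
nuts_map <- tibble::tribble(~parent, ~child,
                            "PT17", "PT1A",
                            "PT17", "PT1B",
                            "FRXX", "FRX1",
                            "FRXX", "FRX2")

eu_df |>
  anti_join(nuts_map, join_by(nuts_code == child)) |>
  rows_patch(child_data |> rename(nuts_code = parent),
             by = c("nuts_code", "year"))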
