2

I have a dataframe that looks like so:

# Example data: 6 rows, one POSIXct timestamp column (values 900 s apart,
# stored with tzone = "EST") followed by 13 numeric logger columns.
# The eight `*_Stationary` columns come in U/S pairs; in each pair the U
# column is NA for the first three rows and the S column for the last three.
# NOTE(review): the timestamp column is named `Date_Time_GMT_3` but its tzone
# attribute is "EST" -- confirm which time zone is intended.
df = structure(list(Date_Time_GMT_3 = 
                      structure(c(1622552400, 1622553300,1622554200, 1622555100, 1622556000, 1622556900), 
                                class = c("POSIXct","POSIXt"), 
                                tzone = "EST"),
                    X20819830_R1AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 16.808, 16.713, 17.753), 
                    X20819742_R1AR_S_Stationary = c(16.903, 16.828, 16.808, NA_real_, NA_real_, NA_real_), 
                    X20822215_R3AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 13.942, 13.942, 13.846), 
                    X20822215_R3AR_S_Stationary = c(13.942, 13.972, 13.842, NA_real_, NA_real_, NA_real_), 
                    X20874235_R4AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 14.134, 14.534, 14.404), 
                    X20874235_R4AR_S_Stationary = c(14.23, 14.23, 14.134, NA_real_, NA_real_, NA_real_), 
                    X20874311_F1AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 15.187, 15.327, 15.567), 
                    X20874311_F1AR_S_Stationary = c(15.282, 15.387, 15.587, NA_real_, NA_real_, NA_real_), 
                    X20817727_F8AR_U = c(15.421, 14.441, 14.631, 14.781, 15.521, 15.821), 
                    X20819742_X1AR_U = c(14.996, 15.996, 14.776, 14.920, 14.870, 14.235), 
                    X20819742_R2AR_U = c(14.781, 15.521, 15.821, NA_real_, NA_real_, NA_real_), 
                    X20817727_R5AR_U = c(NA_real_, NA_real_, NA_real_, 13.942, 13.942, 13.846), 
                    X20817727_R7AR = c(14.23, 14.23, 14.134, NA_real_, NA_real_, NA_real_)), 
               row.names = c(NA, 6L), class = "data.frame")

Based on the results of linear models I have already fitted, I want to predict the missing values in this dataframe. Here is an example of the results I have for the linear models:

# Lookup table of model pairings: one row per `response` column of `df`,
# naming the `predictor` column it is paired with and the `r.squared`
# recorded for that pairing.
# The object is a dplyr `grouped_df` grouped by `response` (one row per
# group, see the `groups` attribute), so it needs ungroup() before being
# processed as a plain table.
df_HighR = structure(list(response = c("X20817727_F8AR_U", "X20817727_R5AR_U", 
"X20817727_R7AR", "X20819742_R2AR_U", "X20819742_X1AR_U"), predictor = c("X20819742_R1AR_S_Stationary", 
"X20822215_R3AR_U_Stationary", "X20874235_R4AR_S_Stationary", 
"X20819742_R1AR_S_Stationary", "X20822215_R3AR_U_Stationary"), 
    r.squared = c(0.859062596478993, 1, 1, 0.993125520793874, 
    0.995714040802335)), class = c("grouped_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -5L), groups = structure(list(
    response = c("X20817727_F8AR_U", "X20817727_R5AR_U", "X20817727_R7AR", 
    "X20819742_R2AR_U", "X20819742_X1AR_U"), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

Essentially, every column that has NA values needs to be run through the predict.lm() function against the column it is paired with in the df_HighR dataframe (e.g. column X20817727_F8AR_U has NA values that will be predicted from X20819742_R1AR_S_Stationary).

I have code that works but I'm wondering if there is a way to simplify it. Code below:


# Make the linear model for best R squared for each mobile logger.
#
# Columns are referenced by bare (backticked) name with `data = df` rather
# than as `df$col` inside the formula: with `df$col` terms, predict() cannot
# substitute `newdata` and silently re-evaluates the training columns.
#
# NOTE(review): the column names used here (no `X` prefix, `*_Stationary`
# without `_U_`/`_S_`) differ from the example `df` shown earlier --
# presumably they match the real dataset; confirm before running.
model_F8AR <- lm(`20817727_F8AR_U` ~ `20822215_R3AR_Stationary`, data = df)
summary(model_F8AR)


model_R2AR <- lm(`20819742_R2AR_U` ~ `20822215_R3AR_Stationary`, data = df)
summary(model_R2AR)

model_R5AR <- lm(`20817727_R5AR_U` ~ `20874311_F1AR_Stationary`, data = df)
summary(model_R5AR)

model_X1AR <- lm(`20819742_X1AR_U` ~ `20874311_F1AR_Stationary`, data = df)
summary(model_X1AR)


######## Predict the values for mobile loggers
# Each prediction is a data frame with the fitted value and the lower/upper
# bounds of the confidence interval, one row per row of `df`. Rows where the
# predictor is NA yield NA predictions. `newdata` is spelled out in full
# instead of relying on partial matching of `new`.

# F8AR
Predicted_F8AR <- predict(
  model_F8AR,
  newdata = df["20822215_R3AR_Stationary"],
  interval = "confidence"
)
Predicted_F8AR <- as.data.frame(Predicted_F8AR)
names(Predicted_F8AR) <- c("F8AR_Predicted", "F8AR_lwr", "F8AR_upr")

# R2AR
Predicted_R2AR <- predict(
  model_R2AR,
  newdata = df["20822215_R3AR_Stationary"],
  interval = "confidence"
)
Predicted_R2AR <- as.data.frame(Predicted_R2AR)
names(Predicted_R2AR) <- c("R2AR_Predicted", "R2AR_lwr", "R2AR_upr")

# R5AR
Predicted_R5AR <- predict(
  model_R5AR,
  newdata = df["20874311_F1AR_Stationary"],
  interval = "confidence"
)
Predicted_R5AR <- as.data.frame(Predicted_R5AR)
names(Predicted_R5AR) <- c("R5AR_Predicted", "R5AR_lwr", "R5AR_upr")


# X1AR
Predicted_X1AR <- predict(
  model_X1AR,
  newdata = df["20874311_F1AR_Stationary"],
  interval = "confidence"
)
Predicted_X1AR <- as.data.frame(Predicted_X1AR)
names(Predicted_X1AR) <- c("X1AR_Predicted", "X1AR_lwr", "X1AR_upr")

Any ideas of how to clean this up?

1 Answer 1

1

We can use map2 to loop over the 'response' and 'predictor' columns of the 'df_HighR' dataset, fit an lm for each pair, and store the models and their predictions as list columns.

library(purrr)
library(dplyr)
# For each (response, predictor) pair in df_HighR, fit
# lm(response ~ predictor) on `df` and keep both the fitted model and its
# predictions (fit / lwr / upr) as list-columns `Model` and `predicted`.
out <- df_HighR %>%
  # df_HighR is a grouped_df; drop the grouping before adding list-columns
  ungroup() %>%
  mutate(
    # reformulate() builds the model formula from the two column-name strings
    Model = map2(
      response, predictor,
      function(resp, pred) lm(reformulate(pred, response = resp), data = df)
    ),
    # `newdata` is written out in full rather than relying on partial
    # matching of `new`; df[pred] is the one-column data frame of predictor
    # values, so rows with an NA predictor give NA predictions
    predicted = map2(
      Model, predictor,
      function(fit, pred) {
        intervals <- predict(fit, newdata = df[pred], interval = "confidence")
        as.data.frame(intervals)
      }
    )
  )

-output

> out
# A tibble: 5 × 5
  response         predictor                   r.squared Model  predicted   
  <chr>            <chr>                           <dbl> <list> <list>      
1 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>   <df [6 × 3]>
2 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>   <df [6 × 3]>
3 X20817727_R7AR   X20874235_R4AR_S_Stationary     1     <lm>   <df [6 × 3]>
4 X20819742_R2AR_U X20819742_R1AR_S_Stationary     0.993 <lm>   <df [6 × 3]>
5 X20819742_X1AR_U X20822215_R3AR_U_Stationary     0.996 <lm>   <df [6 × 3]>

The output could be unnested

library(tidyr)
# Expand the `predicted` list-column so that every prediction row
# (fit, lwr, upr) becomes its own row next to its response/predictor pair.
unnest(out, cols = predicted)
# A tibble: 30 × 7
   response         predictor                   r.squared Model    fit   lwr   upr
   <chr>            <chr>                           <dbl> <list> <dbl> <dbl> <dbl>
 1 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    15.4  11.9  18.8
 2 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    14.7  12.4  16.9
 3 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    14.5  11.7  17.2
 4 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    NA    NA    NA  
 5 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    NA    NA    NA  
 6 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    NA    NA    NA  
 7 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    NA    NA    NA  
 8 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    NA    NA    NA  
 9 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    NA    NA    NA  
10 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    13.9  13.9  13.9
# … with 20 more rows
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.