library(tidyverse)
library(jsonlite)
library(sf)
library(lutz)

#' Convert a distance in meters to miles
#'
#' @param x Numeric vector of distances in meters.
#' @return Numeric vector the same length as `x`, in miles.
meters_to_miles <- function(x) {
  meters_per_mile <- 1609.344
  x / meters_per_mile
}

# Start and end timestamps ------------------------------------------------

# Start/end of each leg of the trip as UTC date-times. The result is a nested
# list accessed like official_times$there$start; these bounds are used further
# down to filter every dataset to the trip itself.
official_times <- lapply(
  list(
    there = list(
      start = "2023-06-03 11:34:01",
      end = "2023-06-09 21:06:03"
    ),
    back_again = list(
      start = "2023-06-20 14:58:26",
      end = "2023-06-25 16:51:12"
    )
  ),
  # Parse both timestamps of a leg, keeping the start/end names
  function(leg) lapply(leg, ymd_hms, tz = "UTC")
)


# Place IDs of stops that weren't gas or bathroom breaks (hotels, sights,
# family, home). Any placeVisit whose id is NOT in this vector gets flagged as
# a driving stop (`driving_stop = TRUE`) when place_visits is built below.
not_driving_stops <- c(
  "ChIJnwDSTcsDnogRLyt_lqVprLY",  # Hotel in New Orleans, LA
  "ChIJgcNDAhKmIIYRRA4mio_7VgE",  # Parking in the French Quarter
  "ChIJv30_Xw-mIIYRpt26QbLBh58",  # Louis Armstrong Park
  "ChIJ59n6fW8dnogRWi-5N6olcyU",  # Chalmette Battlefield
  "ChIJ_7z4c1_2XIYR6b9p0NvEiVE",  # Hotel in San Antonio, TX
  "ChIJX4k2TVVfXIYRIsTnhA-P-Rc",  # The Alamo
  "ChIJAdu5Qad544YRhyJT8qzimi4",  # Hotel in Carlsbad, NM
  "ChIJW9e4xBN544YRvbI7vfc91G4",  # Carlsbad Caverns
  "ChIJERiZZMWOLYcRQbo78w80s34",  # Hotel in Flagstaff, AZ
  "ChIJrSLbJJIQM4cR4l5HTswDY8k",  # Grand Canyon
  "ChIJw_8NjzERSocRoMj48srdz9c",  # Cabin in Grover, UT
  "ChIJU6LnB_8ASocRB_9PSFPsO94",  # Capitol Reef
  "ChIJIw-BhQkZSocRncIWG0YMLJU",  # Capitol Reef Visitor Center
  "ChIJ6VbxQZIXSocR-SpwZ6W5ens",  # Capitol Reef Goosenecks
  "ChIJaSHejn29SYcR-YzTt_DNlTg",  # Goblin Valley
  "ChIJUUCyjPG8TYcR50RxmIdxSNw",  # Aunt's house in Spanish Fork, UT
  "ChIJVQ4oZOP4VFMREjDKbf7bHIE",  # Sister's house in Shelley, ID
  "ChIJp4yR8asLVFMRJJExTuHrYEs",  # Porter's Park in Rexburg, ID
  "ChIJGyd1jm8LVFMRzgcOZZumVzU",  # Hotel in Rexburg, ID
  "ChIJ3zGqpb65UVMR0rTSaqVZ5kc",  # Yellowstone
  "ChIJXy5ZgRvtUVMRoSJoWid8Owg",  # Old Faithful
  "ChIJSWHsxv8JTlMR82z8b6wF_BM",  # Place that had snow on the side of the road in Yellowstone
  "ChIJzTXkoTKUNFMRRC1n33hYsEQ",  # Hotel in Gillette, WY
  "ChIJOT5U8z8GM1MResed1BOdJKk",  # Devil's Tower
  "ChIJ39Y-tdg1fYcRQcZcBb499do",  # Mount Rushmore
  "ChIJWWVl4b_KjocRCMsTmFCHahQ",  # Hotel in Sioux Falls, SD
  "ChIJg_2MNnKRk4cRQGXbuvgqba4",  # Visitors Center in Winter Quarters
  "ChIJ36q1kAYm54cR36q-3xRQA4Y",  # Hotel in Nauvoo, IL
  "ChIJh53YJHIm54cRmpf8_ZA3CVw",  # Nauvoo visitors center
  "ChIJDUPPu3Im54cRKj6BG8UkOko",  # Nauvoo temple
  "ChIJg0abVHYm54cR85yQbfLjt2o",  # Nauvoo family living center
  "ChIJm7cRetkl54cR-lEKk-eZnXA",  # Smith family cemetery
  "ChIJZ6tHUwsm54cRbmWsF639PjY",  # Carthage jail
  "ChIJtUcJ-n36ZIgRhzY2PM19eWA",  # Hotel in Nashville, TN
  "ChIJGUwR7Juh9YgRHxAIB27Mi-U"   # Home
)

# Records.json ------------------------------------------------------------

# Read the raw location history, keep only its "locations" element (the same
# as doing full_data$locations), and convert it to a tibble so it prints
# nicely
all_locations_raw <- as_tibble(
  pluck(
    read_json("data_real/Records.json", simplifyVector = TRUE),
    "locations"
  )
)

# Clean the raw location records: keep only the points recorded during the two
# legs of the trip, turn them into sf points, look up each point's time zone,
# and add local-time helper columns for grouping and filtering
all_locations <- all_locations_raw %>%
  # Parse the timestamp column into a UTC date-time. This is done exactly once;
  # re-parsing an already-parsed date-time would round-trip it through
  # character formatting for no benefit.
  mutate(timestamp = ymd_hms(timestamp, tz = "UTC")) %>%
  # Only keep points that fall inside one of the two legs of the trip
  filter(
    (timestamp >= official_times$there$start &
        timestamp <= official_times$there$end) |
      (timestamp >= official_times$back_again$start &
          timestamp <= official_times$back_again$end)
  ) %>%
  # Scale down the location data (divide any column that ends in E7 by 10000000)
  mutate(across(ends_with("E7"), ~ . / 1e7)) %>%
  # Create a geometry column with the coordinates
  st_as_sf(coords = c("longitudeE7", "latitudeE7"), crs = st_crs("EPSG:4326")) %>%
  # Make a column with the time zone for each point
  mutate(tz = tz_lookup(., method = "accurate")) %>%
  # Create a version of the timestamp in local time, but labelled as UTC.
  # A date-time vector can only carry one time zone, so do the conversion one
  # time zone at a time and pass with_tz() that group's single zone name.
  group_by(tz) %>%
  mutate(timestamp_local = force_tz(with_tz(timestamp, first(tz)), "UTC")) %>%
  ungroup() %>%
  # Add a column for direction
  mutate(
    direction = ifelse(timestamp <= ymd("2023-06-15", tz = "UTC"), "There", "Back again"),
    direction = fct_inorder(direction)
  ) %>%
  # Add some helper columns for filtering, grouping, etc.
  mutate(
    year = year(timestamp_local),
    month = month(timestamp_local),
    day = day(timestamp_local)
  ) %>%
  mutate(
    # strftime() formats in the *system* time zone unless tz is given, which
    # would shift these "local time stored as UTC" values on any machine that
    # isn't set to UTC, so pin it to UTC to stay consistent with year/month/day
    day_month = strftime(timestamp_local, "%B %e", tz = "UTC"),
    # With %e, there's a leading space for single-digit numbers, so we remove
    # any double spaces and replace them with single spaces
    # (e.g., "June  3" becomes "June 3")
    day_month = str_replace(day_month, "  ", " "),
    day_month = fct_inorder(day_month)
  )

# Build one route per day: nest the points by day_month, merge each day's
# points into a single geometry, cast it to a LINESTRING, and make that new
# `path` column the active geometry
daily_routes <- all_locations %>%
  group_by(day_month) %>%
  nest() %>%
  mutate(
    path = map(data, function(day_points) {
      day_points %>%
        st_combine() %>%
        st_cast("LINESTRING")
    })
  ) %>%
  unnest(path) %>%
  st_set_geometry("path")


# Semantic location history -----------------------------------------------

## placeVisits ------------------------------------------------------------

# Time zone names like America/New_York are great for computers, but sometimes
# I want to show US-standard abbreviations like EDT (Eastern Daylight Time)
# instead. This small lookup table can be joined to bigger datasets (by `tz`)
# to get those abbreviations.
tz_abbreviations <- tibble(
  tz = c(
    "America/New_York",
    "America/Chicago",
    "America/Denver",
    "America/Phoenix",
    "America/Boise"
  ),
  tz_abb = c("EDT", "CDT", "MDT", "MST", "MDT")
)

# Read the June 2023 semantic location history as a nested list and keep only
# the placeVisit entries
place_visits_raw <- read_json(
  "data_real/Semantic Location History/2023/2023_JUNE.json",
  simplifyVector = FALSE
) %>%
  # Extract the timelineObjects JSON element
  pluck("timelineObjects") %>%
  # Pull the placeVisit element out of every timeline object; objects that
  # don't have one (i.e. the activitySegments) come back as NULL
  # { Shorter selection-based equivalent: map("placeVisit") }
  map(function(timeline_object) pluck(timeline_object, "placeVisit")) %>%
  # Discard all the empty elements
  compact()

# Turn the nested placeVisit list into one sf data frame of stops made during
# the trip, with durations, a gas/bathroom-break indicator, time zones,
# local-time helper columns, and anonymized names/addresses
place_visits <- place_visits_raw %>%
  # Extract parts of the nested list (one single-row tibble per visit)
  map(~{
    tibble(
      id = .x$location$placeId,
      latitudeE7 = .x$location$latitudeE7 / 1e7,
      longitudeE7 = .x$location$longitudeE7 / 1e7,
      name = .x$location$name,
      address = .x$location$address,
      startTimestamp = ymd_hms(.x$duration$startTimestamp, tz = "UTC"),
      endTimestamp = ymd_hms(.x$duration$endTimestamp, tz = "UTC")
    )
  }) %>%
  list_rbind() %>%
  # Keep every visit that overlaps one of the two legs of the trip
  filter(
    (endTimestamp >= official_times$there$start &
        startTimestamp <= official_times$there$end) |
      (endTimestamp >= official_times$back_again$start &
          startTimestamp <= official_times$back_again$end)
  ) %>%
  # Calculate the duration of the stop
  mutate(duration = endTimestamp - startTimestamp) %>%
  # Make an indicator for if the stop was a gas or bathroom break
  mutate(driving_stop = !(id %in% not_driving_stops)) %>%
  # Make a geometry column
  st_as_sf(coords = c("longitudeE7", "latitudeE7"), crs = st_crs("EPSG:4326")) %>%
  # Make a column with the time zone for each point
  mutate(tz = tz_lookup(., method = "accurate")) %>%
  # Create a version of the timestamps in local time, but labelled as UTC.
  # A date-time vector can only carry one time zone, so convert one time zone
  # at a time and pass with_tz() that group's single zone name.
  group_by(tz) %>%
  mutate(
    startTimestamp_local = force_tz(with_tz(startTimestamp, first(tz)), "UTC"),
    endTimestamp_local = force_tz(with_tz(endTimestamp, first(tz)), "UTC")
  ) %>%
  ungroup() %>%
  # Add a column for direction
  mutate(
    direction = ifelse(startTimestamp <= ymd("2023-06-15", tz = "UTC"), "There", "Back again"),
    direction = fct_inorder(direction)
  ) %>%
  # Add some helper columns for filtering, grouping, etc.
  mutate(
    year = year(startTimestamp_local),
    month = month(startTimestamp_local),
    day = day(startTimestamp_local)
  ) %>%
  # The first stop of each direction of the trip starts on the previous day
  # (since we slept either at home or at my aunt's house in Spanish Fork, Utah),
  # so use the ending time (i.e. the departure time) for the day_month for those
  # entries
  group_by(direction) %>%
  mutate(
    # strftime() formats in the *system* time zone unless tz is given, which
    # would shift these "local time stored as UTC" values on any machine that
    # isn't set to UTC, so pin it to UTC to stay consistent with year/month/day
    day_month = ifelse(
      row_number() == 1,
      strftime(endTimestamp_local, "%B %e", tz = "UTC"),
      strftime(startTimestamp_local, "%B %e", tz = "UTC")
    ),
    # With %e, there's a leading space for single-digit numbers, so we remove
    # any double spaces and replace them with single spaces
    # (e.g., "June  3" becomes "June 3")
    day_month = str_replace(day_month, "  ", " "),
    day_month = fct_inorder(day_month)
  ) %>%
  ungroup() %>%
  # Bring in abbreviated time zones
  left_join(tz_abbreviations, by = join_by(tz)) %>%
  # Fix some missing values + anonymize some addresses
  mutate(
    name = case_when(
      id == "ChIJGUwR7Juh9YgRHxAIB27Mi-U" ~ "Home",
      id == "ChIJVQ4oZOP4VFMREjDKbf7bHIE" ~ "My sister's house",
      id == "ChIJUUCyjPG8TYcR50RxmIdxSNw" ~ "My aunt's house",
      id == "ChIJw_8NjzERSocRoMj48srdz9c" ~ "My aunt's cabin",
      TRUE ~ name
    ),
    address = case_when(
      id == "ChIJGUwR7Juh9YgRHxAIB27Mi-U" ~ "Atlanta, GA, USA",
      id == "ChIJVQ4oZOP4VFMREjDKbf7bHIE" ~ "Shelley, ID, USA",
      id == "ChIJUUCyjPG8TYcR50RxmIdxSNw" ~ "Spanish Fork, UT, USA",
      id == "ChIJw_8NjzERSocRoMj48srdz9c" ~ "Grover, UT, USA",
      TRUE ~ address
    )
  )


## activitySegments -------------------------------------------------------

# Read the June 2023 semantic location history as a nested list and keep only
# the activitySegment entries
activity_segments_raw <- read_json(
  "data_real/Semantic Location History/2023/2023_JUNE.json",
  simplifyVector = FALSE
) %>%
  # Extract the timelineObjects JSON element
  pluck("timelineObjects") %>%
  # Pull the activitySegment element out of every timeline object; objects
  # that don't have one (i.e. the placeVisits) come back as NULL
  map(function(timeline_object) pluck(timeline_object, "activitySegment")) %>%
  # Discard all the empty elements
  compact()

# Flatten the nested activitySegment list into one row per segment and keep
# only the segments that overlap one of the two legs of the trip
activity_segments_not_clean <- activity_segments_raw %>%
  # Build a tibble from the pieces of each segment that we care about
  map(function(segment) {
    start_point <- segment$startLocation
    end_point <- segment$endLocation
    time_span <- segment$duration

    tibble(
      distance_m = segment$distance,
      activity_type = segment$activityType,
      start_latitudeE7 = start_point$latitudeE7 / 1e7,
      start_longitudeE7 = start_point$longitudeE7 / 1e7,
      end_latitudeE7 = end_point$latitudeE7 / 1e7,
      end_longitudeE7 = end_point$longitudeE7 / 1e7,
      startTimestamp = ymd_hms(time_span$startTimestamp, tz = "UTC"),
      endTimestamp = ymd_hms(time_span$endTimestamp, tz = "UTC")
    )
  }) %>%
  list_rbind() %>%
  # A segment overlaps a leg if it starts before the leg ends and ends after
  # the leg starts
  filter(
    (startTimestamp <= official_times$there$end &
        endTimestamp >= official_times$there$start) |
      (startTimestamp <= official_times$back_again$end &
          endTimestamp >= official_times$back_again$start)
  )

# activity_segments_not_clean needs to be a separate data frame so that we can
# refer to it here to make a second geometry column from the end
# latitude/longitude.
#
# This adds duration/distance/speed, start and end geometries, time zones,
# local-time helper columns, time zone abbreviations, and an id to each segment.
activity_segments <- activity_segments_not_clean %>%
  # Calculate the duration and distance and speed of the segment
  mutate(duration = endTimestamp - startTimestamp) %>%
  mutate(distance_miles = meters_to_miles(distance_m)) %>%
  mutate(
    # Subtracting date-times picks the difftime units automatically (secs,
    # mins, hours, or days) based on the data, so ask for hours explicitly
    # instead of assuming the units are minutes
    hours = as.numeric(duration, units = "hours"),
    avg_mph = distance_miles / hours
  ) %>%
  # Make two geometry columns
  st_as_sf(coords = c("start_longitudeE7", "start_latitudeE7"), crs = st_crs("EPSG:4326")) %>%
  rename("geometry_start" = "geometry") %>%
  mutate(geometry_end = st_geometry(
    st_as_sf(
      activity_segments_not_clean,
      coords = c("end_longitudeE7", "end_latitudeE7"),
      crs = st_crs("EPSG:4326"))
  )
  ) %>%
  select(-end_longitudeE7, -end_latitudeE7) %>%
  # Make a column with the time zone for each point
  mutate(tz_start = tz_lookup(geometry_start, method = "accurate")) %>%
  mutate(tz_end = tz_lookup(geometry_end, method = "accurate")) %>%
  # Create a version of the timestamps in local time, but labelled as UTC.
  # A date-time vector can only carry one time zone, so convert one time zone
  # at a time and pass with_tz() that group's single zone name.
  group_by(tz_start) %>%
  mutate(startTimestamp_local = force_tz(with_tz(startTimestamp, first(tz_start)), "UTC")) %>%
  ungroup() %>%
  group_by(tz_end) %>%
  mutate(endTimestamp_local = force_tz(with_tz(endTimestamp, first(tz_end)), "UTC")) %>%
  ungroup() %>%
  # Add a column for direction
  mutate(
    direction = ifelse(startTimestamp <= ymd("2023-06-15", tz = "UTC"), "There", "Back again"),
    direction = fct_inorder(direction)
  ) %>%
  # Add some helper columns for filtering, grouping, etc.
  mutate(
    year = year(startTimestamp_local),
    month = month(startTimestamp_local),
    day = day(startTimestamp_local)
  ) %>%
  mutate(
    # strftime() formats in the *system* time zone unless tz is given, which
    # would shift these "local time stored as UTC" values on any machine that
    # isn't set to UTC, so pin it to UTC to stay consistent with year/month/day
    day_month = strftime(startTimestamp_local, "%B %e", tz = "UTC"),
    # With %e, there's a leading space for single-digit numbers, so we remove
    # any double spaces and replace them with single spaces
    # (e.g., "June  3" becomes "June 3")
    day_month = str_replace(day_month, "  ", " "),
    day_month = fct_inorder(day_month)
  ) %>%
  # Bring in abbreviated time zones for both the start and end time zones
  left_join(
    rename(tz_abbreviations, "tz_start_abb" = "tz_abb"),
    by = join_by(tz_start == tz)
  ) %>%
  left_join(
    rename(tz_abbreviations, "tz_end_abb" = "tz_abb"),
    by = join_by(tz_end == tz)
  ) %>%
  # Create an id column so we can better reference individual activities
  # Make it a character so it can combine with the place visit id column
  # (seq_len() rather than 1:n() so that zero rows gives zero ids)
  mutate(id = as.character(seq_len(n())))


## All stops and activities -----------------------------------------------

# Stack the place visits and the activity segments into one data frame, with a
# `type` column ("visit" or "segment") recording where each row came from, and
# sort everything by start time
all_stops_activities <- arrange(
  bind_rows(visit = place_visits, segment = activity_segments, .id = "type"),
  startTimestamp
)
all_stops_activities


# Save everything ---------------------------------------------------------

# Bundle every cleaned object into one named list (each element is named after
# the object it holds) and write it out as a single RDS file
output <- tibble::lst(
  official_times = official_times,
  all_locations = all_locations,
  tz_abbreviations = tz_abbreviations,
  place_visits = place_visits,
  daily_routes = daily_routes,
  activity_segments = activity_segments,
  all_stops_activities = all_stops_activities
)
saveRDS(output, file = "data_real/clean_data.rds")

# Load this whole RDS file into the global environment like this:
# invisible(list2env(readRDS("data_real/clean_data.rds"), .GlobalEnv))