#' Evaluate how well new typicality ratings predict human ratings and compare them to LLM baselines
#'
#' @description
#' This function compares external typicality ratings (e.g., generated by a new LLM)
#' against the validation dataset included in 'baserater'. The validation set contains
#' average typicality ratings collected from 50 Prolific participants on a subset of
#' 100 group–adjective pairs, as described in the accompanying paper.
#'
#' The input ratings are merged with this reference set, and then:
#' \enumerate{
#'   \item Computes a correlation (`cor.test`) between the external ratings and the human average;
#'   \item Compares it to one or more built-in model baselines (default: 'GPT-4' and 'LLaMA 3.3');
#'   \item Prints a clear summary of all correlation coefficients and flags whether the external model outperforms each baseline;
#'   \item Returns a tidy result invisibly.
#' }
#'
#' @details
#' Column names of `df` are lower-cased before use, so `Rating` or `GROUP` are
#' accepted. `rating` is coerced with `as.numeric()`; rows whose rating is `NA`
#' after coercion are dropped with a warning. Because every validation item is
#' required, dropping a row that belongs to the validation set results in an error.
#'
#' @param df A data frame with columns `adjective`, `group`, and `rating`. Must contain
#'           exactly one typicality score for each of the 100 validation items used in
#'           the original study.
#' @param method The correlation method to use in [stats::cor.test()]. Must be one of:
#'               `"pearson"` (default), `"spearman"`, or `"kendall"`.
#' @param baselines Character vector of column names in the validation set to compare against
#'                  (default: `c("mean_gpt4_rating", "mean_llama3_rating")`). Names that
#'                  are not found in the merged data are skipped with a warning.
#' @param verbose Logical. If `TRUE` (default), prints a summary of the correlations
#'                and baseline comparisons. Set to `FALSE` to suppress console output.
#'
#' @return A tibble (invisibly) with one row per model (`external` and each baseline
#'         that was found), and columns `model`, `r`, and `p` for the correlation
#'         coefficient and p-value.
#' @export
#' @examples
#' \dontrun{
#' validation <- download_data("validation_ratings")
#' new_scores <- tibble::tibble(
#'   group = validation$group,
#'   adjective = validation$adjective,
#'   rating = runif(nrow(validation))  # Replace with model predictions
#' )
#' evaluate_external_ratings(new_scores)
#' }
evaluate_external_ratings <- function(df,
                                      method    = "pearson",
                                      baselines = c("mean_gpt4_rating",
                                                    "mean_llama3_rating"),
                                      verbose   = TRUE) {

  # --- Validate arguments ---------------------------------------------------
  # Same choices (and partial matching) as stats::cor.test(), but checked
  # before any data is downloaded so a typo fails fast.
  method <- match.arg(method, c("pearson", "kendall", "spearman"))

  if (!is.data.frame(df)) {
    stop("`df` must be a data frame.", call. = FALSE)
  }

  df <- dplyr::rename_with(df, tolower)
  needed <- c("adjective", "group", "rating")
  if (!all(needed %in% names(df))) {
    stop("`df` must contain columns: ", paste(needed, collapse = ", "),
         call. = FALSE)
  }

  # Coerce first, then look for NAs *before* dropping them; checking after
  # the drop could never trigger the warning.
  df[["rating"]] <- as.numeric(df[["rating"]])
  if (anyNA(df[["rating"]])) {
    warning("NAs in `rating` were dropped before analysis.", call. = FALSE)
    df <- tidyr::drop_na(df, "rating")
  }

  # --- Load validation set --------------------------------------------------
  ratings <- download_data("validation_ratings")
  n_expected <- 100L
  reference <- "mean_human_rating"

  merged <- dplyr::inner_join(df, ratings,
                              by = c("adjective", "group"),
                              keep = FALSE)

  # A duplicated pair could otherwise mask a missing item while still
  # producing the expected number of rows.
  if (anyDuplicated(merged[c("adjective", "group")]) > 0) {
    stop("Each adjective-group pair must be matched exactly once; ",
         "duplicated pairs were found after merging with the validation set.",
         call. = FALSE)
  }

  if (nrow(merged) != n_expected) {
    stop("Your data frame must contain ratings for all ", n_expected,
         " validation items.\nMatched only ", nrow(merged), " rows.",
         call. = FALSE)
  }

  # The join suffixes clashing column names, which would hide the reference
  # column; fail with an explicit message instead of an obscure cor.test error.
  if (!reference %in% names(merged)) {
    stop("Column `", reference, "` is not available after merging; ",
         "`df` must not contain a column with that name.",
         call. = FALSE)
  }

  # --- Correlation helper ---------------------------------------------------
  get_cor <- function(name, x, y) {
    ct <- stats::cor.test(x, y, method = method)
    tibble::tibble(model = name,
                   r     = unname(ct$estimate),
                   p     = ct$p.value)
  }

  # --- External model and baselines -----------------------------------------
  unknown <- setdiff(baselines, names(merged))
  if (length(unknown) > 0) {
    warning("Skipping baselines not found in the validation data: ",
            paste(unknown, collapse = ", "),
            call. = FALSE)
  }
  known <- baselines[baselines %in% names(merged)]

  external_row <- get_cor("external", merged[["rating"]], merged[[reference]])
  baseline_rows <- lapply(known, function(b) {
    get_cor(b, merged[[b]], merged[[reference]])
  })
  out <- dplyr::bind_rows(c(list(external_row), baseline_rows))

  # --- Pretty print summary (conditionally) ---------------------------------
  nice_r <- function(x) format(round(x, 3), nsmall = 3)
  nice_p <- function(x) format.pval(x, digits = 2, eps = 1e-4)

  if (isTRUE(verbose)) {
    use_cli <- requireNamespace("cli", quietly = TRUE)

    new_r <- nice_r(out$r[1])
    new_p <- nice_p(out$p[1])
    if (use_cli) {
      cli::cli_alert_success("New ratings: r = {new_r} (p = {new_p})")
    } else {
      message("New ratings: r = ", new_r, " (p = ", new_p, ")")
    }

    # seq_len(...)[-1] is empty when there are no baseline rows, whereas
    # 2:nrow(out) would count down to c(2, 1) and index a missing row.
    for (i in seq_len(nrow(out))[-1]) {
      # isTRUE() keeps an NA correlation from aborting the summary.
      verdict <- if (isTRUE(out$r[1] > out$r[i])) {
        "outperform"
      } else {
        "do NOT outperform"
      }
      model_i <- out$model[i]
      r_i <- nice_r(out$r[i])
      p_i <- nice_p(out$p[i])

      if (use_cli) {
        cli::cli_alert_info(
          "{model_i} baseline: r = {r_i} (p = {p_i}) - new ratings {verdict} this baseline."
        )
      } else {
        message(model_i, " baseline: r = ", r_i,
                " (p = ", p_i,
                ") - new ratings ", verdict, " this baseline.")
      }
    }
  }

  invisible(out)
}
