% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/prepInputs.R
\name{prepInputs}
\alias{prepInputs}
\title{Download and optionally post-process files}
\usage{
prepInputs(
  targetFile = NULL,
  url = NULL,
  archive = NULL,
  alsoExtract = NULL,
  destinationPath = getOption("reproducible.destinationPath", "."),
  fun = NULL,
  quick = getOption("reproducible.quick"),
  overwrite = getOption("reproducible.overwrite", FALSE),
  purge = FALSE,
  useCache = getOption("reproducible.useCache", 2),
  .tempPath,
  verbose = getOption("reproducible.verbose", 1),
  ...
)
}
\arguments{
\item{targetFile}{Character string giving the filename (without relative or
absolute path) to the eventual file
(raster, shapefile, csv, etc.) after downloading and extracting from a zip
or tar archive. This is the file \emph{before} it is passed to
\code{postProcess}. The internal checksumming does not checksum
the file after it is \code{postProcess}ed (e.g., cropped/reprojected/masked).
Using \code{Cache} around \code{prepInputs} will do a sufficient job in these cases.
See table in \code{\link[=preProcess]{preProcess()}}.}

\item{url}{Optional character string indicating the URL to download from.
If not specified, then no download will be attempted. If no entry
exists in the \code{CHECKSUMS.txt} (in \code{destinationPath}), an entry
will be created or appended to. This \code{CHECKSUMS.txt} entry will be used
in subsequent calls to
\code{prepInputs} or \code{preProcess}, comparing the file on hand with the ad hoc
\code{CHECKSUMS.txt}. See table in \code{\link[=preProcess]{preProcess()}}.}

\item{archive}{Optional character string giving the path of an archive
containing \code{targetFile}, or a vector giving a set of nested archives
(e.g., \code{c("xxx.tar", "inner.zip", "inner.rar")}). If there is/are (an) inner
archive(s), but they are unknown, the function will try all until it finds
the \code{targetFile}. See table in \code{\link[=preProcess]{preProcess()}}. If it is \code{NA},
then it will \emph{not} attempt to treat it as an archive, even if it has an archive-like
file extension (e.g., \code{.zip}). This may be useful when an R function
is expecting an archive directly.}

\item{alsoExtract}{Optional character string naming files other than
\code{targetFile} that must be extracted from the \code{archive}. If
\code{NULL}, the default, then it will extract all files. Other options:
\code{"similar"} will extract all files with the same filename without
file extension as \code{targetFile}. \code{NA} will extract nothing other
than \code{targetFile}. A character string of specific file names will cause
only those to be extracted. See table in \code{\link[=preProcess]{preProcess()}}.}

\item{destinationPath}{Character string of a directory in which to download
and save the file that comes from \code{url} and is also where the function
will look for \code{archive} or \code{targetFile}. NOTE (still experimental):
To prevent repeated downloads in different locations, the user can also set
\code{options("reproducible.inputPaths")} to one or more local file paths to
search for the file before attempting to download. Default for that option is
\code{NULL} meaning do not search locally.}

\item{fun}{Optional. If specified, this will attempt to load whatever
file was downloaded during \code{preProcess} via \code{dlFun}. This can be either a
function (e.g., sf::st_read), character string (e.g., "base::load"),
NA (for no loading, useful if \code{dlFun} already loaded the file) or
if extra arguments are required
in the function call, it must be a call naming
\code{targetFile} (e.g., \code{sf::st_read(targetFile, quiet = TRUE)})
as the file path to the file to load. See details and examples below.}

\item{quick}{Logical. This is passed internally to \code{\link[=Checksums]{Checksums()}}
(the quickCheck argument), and to
\code{\link[=Cache]{Cache()}} (the quick argument). This results in faster, though
less robust checking of inputs. See the respective functions.}

\item{overwrite}{Logical. Should downloading and all the other actions occur
even if they pass the checksums or the files are all there.}

\item{purge}{Logical or Integer. \code{0/FALSE} (default) keeps existing
\code{CHECKSUMS.txt} file and
\code{prepInputs} will write or append to it. \code{1/TRUE} will delete the entire
\code{CHECKSUMS.txt} file. Other options, see details.}

\item{useCache}{Passed to \code{Cache} in various places.
Defaults to \code{getOption("reproducible.useCache", 2L)} in \code{prepInputs}, and
\code{getOption("reproducible.useCache", FALSE)} if calling any of the inner
functions manually. For \code{prepInputs}, this means it will use \code{Cache}
only up to 2 nested levels, which includes \code{preProcess}. \code{postProcess} and
its nested \verb{*Input} functions (e.g., \code{cropInputs}, \code{projectInputs},
\code{maskInputs}) are no longer internally cached, as \code{terra} processing speeds
mean internal caching is more time consuming. We recommend caching the full
\code{prepInputs} call instead (e.g. \code{prepInputs(...) |> Cache()}).}

\item{.tempPath}{Optional temporary path for internal file intermediate steps.
Will be cleared on.exit from this function.}

\item{verbose}{Numeric, -1 silent (where possible), 0 being very quiet,
1 showing more messaging, 2 being more messaging, etc.
Default is 1. Above 3 will output much more information about the internals of
Caching, which may help diagnose Caching challenges. Can set globally with an
option, e.g., \code{options('reproducible.verbose' = 0)} to reduce to minimal.}

\item{...}{Additional arguments passed to
\code{\link[=postProcess]{postProcess()}} and \code{\link[=Cache]{Cache()}}.
Since \code{...} is passed to \code{\link[=postProcess]{postProcess()}}, these
arguments will also be passed into the inner
functions, e.g., \code{\link[=cropInputs]{cropInputs()}}. Possibly useful other arguments include
\code{dlFun} which is passed to \code{preProcess}. See details and examples.}
}
\value{
This is an omnibus function that will return an R object that will have resulted from
the running of \code{\link[=preProcess]{preProcess()}} and \code{\link[=postProcess]{postProcess()}} or \code{\link[=postProcessTo]{postProcessTo()}}. Thus,
if it is a GIS object, it may have been cropped, reprojected, "fixed", masked, and
written to disk.
}
\description{
\if{html}{\figure{lifecycle-maturing.svg}{options: alt="maturing"}}
}
\details{
This function can be used to prepare R objects from remote or local data sources.
The object of this function is to provide a reproducible version of
a series of commonly used steps for getting, loading, and processing data.
This function has two stages: Getting data (download, extracting from archives,
loading into R) and post-processing (for \verb{Spatial*} and \verb{Raster*}
objects, this is crop, reproject, mask/intersect).
To trigger the first stage, provide \code{url} or \code{archive}.
To trigger the second stage, provide \code{studyArea} or \code{rasterToMatch}.
See examples.
}
\note{
This function is still experimental: use with caution.
}
\section{Stage 1 - Getting data}{


See \code{\link[=preProcess]{preProcess()}} for combinations of arguments.

\enumerate{
\item Download from the web via either \code{googledrive::drive_download()} or
\code{\link[utils:download.file]{utils::download.file()}};
\item Extract from archive using \code{\link[=unzip]{unzip()}} or \code{\link[=untar]{untar()}};
\item Load into R using \code{terra::rast},
\code{sf::st_read}, or any other function passed in with \code{fun};
\item Checksumming of all files during this process. This is put into a
\file{CHECKSUMS.txt} file in the \code{destinationPath}, appending if it is
already there, overwriting the entries for same files if entries already exist.
}
}

\section{Stage 2 - Post processing}{


This will be triggered if either \code{rasterToMatch} or \code{studyArea}
is supplied.

\enumerate{
\item Fix errors. Currently, the only errors fixed are for \code{SpatialPolygons}
using \code{buffer(..., width = 0)};
\item Crop using \code{\link[=cropTo]{cropTo()}};
\item Project using \code{\link[=projectTo]{projectTo()}};
\item Mask using \code{\link[=maskTo]{maskTo()}};
\item Write the file to disk via \code{\link[=writeTo]{writeTo()}}.
}

NOTE: checksumming does not occur during the post-processing stage, as
there are no file downloads. To achieve fast results, wrap
\code{prepInputs} with \code{Cache}.

NOTE: \code{sf} objects are still very experimental.

\subsection{postProcessing of \verb{Spat*}, \code{sf}, \verb{Raster*} and \verb{Spatial*} objects:}{

The following has been DEPRECATED because there are a sufficient number of
ambiguities that this has been changed in favour of \code{from} and the \verb{*to} family.
See \code{\link[=postProcessTo]{postProcessTo()}}.

DEPRECATED: If \code{rasterToMatch} or \code{studyArea} are used, then this will
trigger several subsequent functions, specifically the sequence,
\emph{Crop, reproject, mask}, which appears to be a common sequence while
preparing spatial data from diverse sources.
See \code{\link[=postProcess]{postProcess()}} documentation section on
\emph{Backwards compatibility with \code{rasterToMatch} and/or \code{studyArea} arguments}
to understand various combinations of \code{rasterToMatch} and/or \code{studyArea}.
}
}

\section{\code{fun}}{


\code{fun} offers the ability to pass any custom function with which to load
the file obtained by \code{preProcess} into the session. There are two cases that are
dealt with: when the \code{preProcess} downloads a file (including via \code{dlFun}),
\code{fun} must deal with a file; and, when \code{preProcess} creates an R object
(e.g., raster::getData returns an object), \code{fun} must deal with an object.

\code{fun} can be supplied in three ways: a function, a character string
(i.e., a function name as a string), or an expression.
If a character string or function, it should include the package name, e.g.,
\code{"terra::rast"} or as an actual function, e.g., \code{base::readRDS}.
In these cases, it will evaluate this function call while passing \code{targetFile}
as the first argument. These will only work in the simplest of cases.

When more precision is required, the full call can be written and where the
filename can be referred to as \code{targetFile} if the function
is loading a file. If \code{preProcess} returns an object, \code{fun} should be set to
\code{fun = NA}.

If a custom function call uses a function that is not in a package, \code{prepInputs} may not find it. In such
cases, simply pass the function as a named argument (with the same name as the function) to \code{prepInputs}.
See examples.
NOTE: passing \code{fun = NA} will skip loading object into R. Note this will essentially
replicate the functionality of simply calling \code{preProcess} directly.
}

\section{\code{purge}}{


The options for controlling purging of the \code{CHECKSUMS.txt} file are:

\describe{
\item{\code{0}}{keep file}
\item{\code{1}}{delete file in \code{destinationPath}, all records of downloads need to be rebuilt}
\item{\code{2}}{delete entry with same \code{targetFile}}
\item{\code{4}}{delete entry with same \code{alsoExtract}}
\item{\code{3}}{delete entry with same \code{archive}}
\item{\code{5}}{delete entry with same \code{targetFile} & \code{alsoExtract}}
\item{\code{6}}{delete entry with same \code{targetFile}, \code{alsoExtract} & \code{archive}}
\item{\code{7}}{delete entry with same \code{targetFile}, \code{alsoExtract}, \code{archive} & \code{url}}
}
Values greater than \code{1} will only remove entries in the \code{CHECKSUMS.txt} that are
associated with \code{targetFile}, \code{alsoExtract} or \code{archive}. When \code{prepInputs} is called,
it will write (or append, if it already exists) to a \code{CHECKSUMS.txt} file.
If the \code{CHECKSUMS.txt} is not correct, use this argument to remove it.
}

\examples{
\donttest{
if (requireNamespace("terra", quietly = TRUE) &&
  requireNamespace("sf", quietly = TRUE)) {
  library(reproducible)
  # Make a dummy study area map -- user would supply this normally
  coords <- structure(c(-122.9, -116.1, -99.2, -106, -122.9, 59.9, 65.7, 63.6, 54.8, 59.9),
    .Dim = c(5L, 2L)
  )
  studyArea <- terra::vect(coords, "polygons")
  terra::crs(studyArea) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
  # Make dummy "large" map that must be cropped to the study area
  outerSA <- terra::buffer(studyArea, 50000)
  terra::crs(outerSA) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
  tf <- normPath(file.path(tempdir2("prepInputsEx"), "prepInputs2.shp"))
  terra::writeVector(outerSA, tf, overwrite = TRUE)

  # run prepInputs -- load file, postProcess it to the studyArea

  studyArea2 <- prepInputs(
    targetFile = tf, to = studyArea,
    fun = "terra::vect",
    destinationPath = tempdir2()
  ) |>
    suppressWarnings() # not relevant warning here

  # clean up
  unlink("CHECKSUMS.txt")

  ##########################################
  # Remote file using `url`
  ##########################################
  if (internetExists()) {
    # Limit data.table to 2 threads for the duration of the example
    data.table::setDTthreads(2)
    # download a zip file from internet, unzip all files, load as shapefile, Cache the call
    # First time: don't know all files - prepInputs will guess, if download file is an archive,
    #   then extract all files, then if there is a .shp, it will load with sf::st_read
    dPath <- file.path(tempdir(), "ecozones")
    shpUrl <- "http://sis.agr.gc.ca/cansis/nsdb/ecostrat/zone/ecozone_shp.zip"

    # Wrapped in a try because this particular url can be flaky
    shpEcozone <- try(prepInputs(
      destinationPath = dPath,
      url = shpUrl
    ))
    if (!is(shpEcozone, "try-error")) {
      # Robust to partial file deletions:
      unlink(dir(dPath, full.names = TRUE)[1:3])
      shpEcozone <- prepInputs(
        destinationPath = dPath,
        url = shpUrl
      )
      unlink(dPath, recursive = TRUE)

      # Once this is done, can be more precise in operational code:
      #  specify targetFile, alsoExtract, and fun
      ecozoneFilename <- file.path(dPath, "ecozones.shp")
      ecozoneFiles <- c(
        "ecozones.dbf", "ecozones.prj",
        "ecozones.sbn", "ecozones.sbx", "ecozones.shp", "ecozones.shx"
      )
      shpEcozone <- prepInputs(
        targetFile = ecozoneFilename,
        url = shpUrl,
        fun = "terra::vect",
        alsoExtract = ecozoneFiles,
        destinationPath = dPath
      )
      unlink(dPath, recursive = TRUE)

      # Add a study area to Crop and Mask to
      # Create a "study area"
      coords <- structure(c(-122.98, -116.1, -99.2, -106, -122.98, 59.9, 65.73, 63.58, 54.79, 59.9),
        .Dim = c(5L, 2L)
      )
      studyArea <- terra::vect(coords, "polygons")
      terra::crs(studyArea) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

      #  specify targetFile, alsoExtract, and fun, wrap with Cache
      ecozoneFilename <- file.path(dPath, "ecozones.shp")
      # Note, you don't need to "alsoExtract" the archive... if the archive is not there, but the
      #   targetFile is there, it will not redownload the archive.
      ecozoneFiles <- c(
        "ecozones.dbf", "ecozones.prj",
        "ecozones.sbn", "ecozones.sbx", "ecozones.shp", "ecozones.shx"
      )
      shpEcozoneSm <- Cache(prepInputs,
        url = shpUrl,
        targetFile = reproducible::asPath(ecozoneFilename),
        alsoExtract = reproducible::asPath(ecozoneFiles),
        studyArea = studyArea,
        fun = "terra::vect",
        destinationPath = dPath,
        writeTo = "EcozoneFile.shp"
      ) # passed to determineFilename

      terra::plot(shpEcozone[, 1])
      terra::plot(shpEcozoneSm[, 1], add = TRUE, col = "red")
      # dPath is a directory: unlink() only removes it when recursive = TRUE
      unlink(dPath, recursive = TRUE)
    }
  }
}
}

## Using quoted dlFun and fun -- this is not intended to be run but used as a template
## prepInputs(..., fun = customFun(x = targetFile), customFun = customFun)
##   # or more complex
##  test5 <- prepInputs(
##   targetFile = targetFileLuxRDS,
##   dlFun =
##     getDataFn(name = "GADM", country = "LUX", level = 0) # preProcess keeps file from this!
##   ,
##   fun = {
##     out <- readRDS(targetFile)
##     sf::st_as_sf(out)}
##  )
}
\seealso{
\code{\link[=postProcessTo]{postProcessTo()}}, \code{\link[=downloadFile]{downloadFile()}}, \code{\link[=extractFromArchive]{extractFromArchive()}},
\code{\link[=postProcess]{postProcess()}}.
}
\author{
Eliot McIntire, Jean Marchal, and Tati Micheletti
}
