diff --git a/compose.yaml b/compose.yaml index 31bc5c81b95..8e908975df5 100644 --- a/compose.yaml +++ b/compose.yaml @@ -441,7 +441,9 @@ services: ARROW_HOME: /arrow ARROW_DEPENDENCY_SOURCE: BUNDLED LIBARROW_MINIMAL: "false" - ARROW_MIMALLOC: "ON" + # explicitly enable GCS when we build libarrow so that binary libarrow + # users get more fully-featured builds + ARROW_GCS: "ON" volumes: *ubuntu-volumes command: &cpp-static-command /bin/bash -c " diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 2ab5e574e22..d28639c34aa 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -277,8 +277,16 @@ template concept CBooleanConcept = std::same_as; // XXX: Ideally we want to have std::floating_point = true. +// Some older standard library implementations (e.g., macOS 11.x libc++) have partial +// C++20 concepts support with std::same_as but lack std::floating_point. +#if defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L template concept CFloatingPointConcept = std::floating_point || std::same_as; +#else +template +concept CFloatingPointConcept = + std::is_floating_point_v || std::same_as; +#endif template concept CDecimalConcept = std::same_as || std::same_as || diff --git a/dev/tasks/r/github.macos.cran.yml b/dev/tasks/r/github.macos.cran.yml index dda8ac7fd78..930f7c5587e 100644 --- a/dev/tasks/r/github.macos.cran.yml +++ b/dev/tasks/r/github.macos.cran.yml @@ -21,10 +21,12 @@ jobs: macos-cran: - name: "macOS similar to CRAN" + name: "macOS {{ '${{ matrix.config }}' }}" runs-on: macOS-latest strategy: fail-fast: false + matrix: + config: ["cran-m1", "cran-release"] steps: {{ macros.github_checkout_arrow()|indent }} @@ -58,7 +60,35 @@ jobs: extra-packages: | any::rcmdcheck any::sys - - name: Install + - name: Install MacOSX 11.3 SDK + if: matrix.config == 'cran-release' + env: + SDK_TOKEN: {{ '${{ 
secrets.JONKEANE_MACOS_11_SDK_DOWNLOAD_TOKEN }}' }} + run: | + # Download, Confirm integrity, expand. This will fail if the hash does not match. + curl -fsSL -H "Authorization: Bearer $SDK_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/jonkeane/crossbow_11_sdk/tarball/v0.0.1 \ + -o /tmp/MacOSX11.3.sdk.tar.gz + echo "493570e56d6c6af26128e9096de738822589cc3cdb1b29aa5854f3f4c99756ac /tmp/MacOSX11.3.sdk.tar.gz" | shasum -a 256 -c - + sudo tar -xzf /tmp/MacOSX11.3.sdk.tar.gz -C /Library/Developer/CommandLineTools/SDKs/ + # Move SDK from extracted folder (GitHub archives as {owner}-{repo}-{sha}/) + sudo mv /Library/Developer/CommandLineTools/SDKs/jonkeane-crossbow_11_sdk-*/MacOSX11.3.sdk \ + /Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk + sudo rm -rf /Library/Developer/CommandLineTools/SDKs/jonkeane-crossbow_11_sdk-* + ls -la /Library/Developer/CommandLineTools/SDKs/ + - name: Install (cran-release) + if: matrix.config == 'cran-release' + env: + _R_CHECK_CRAN_INCOMING_: false + SDKROOT: '/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk' + NOT_CRAN: false + run: | + sccache --start-server || echo 'sccache not found' + cd arrow/r + R CMD INSTALL . 
--install-tests + - name: Install (cran-m1) + if: matrix.config == 'cran-m1' env: _R_CHECK_CRAN_INCOMING_: false CXX: "clang++ -mmacos-version-min=14.6" @@ -77,6 +107,6 @@ jobs: - name: Save the test output uses: actions/upload-artifact@v4 with: - name: test-output + name: test-output-{{ '${{ matrix.config }}' }} path: arrow-tests/testthat.Rout* if: always() diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index cedb567f2cd..40d34572922 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -81,7 +81,6 @@ jobs: env: {{ macros.github_set_sccache_envvars()|indent(8) }} MACOSX_DEPLOYMENT_TARGET: "11.6" - ARROW_S3: ON ARROW_GCS: ON ARROW_DEPENDENCY_SOURCE: BUNDLED CMAKE_GENERATOR: Ninja diff --git a/r/NEWS.md b/r/NEWS.md index abfafffb2e2..9b274a8482e 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -18,6 +18,29 @@ --> # arrow 23.0.1 + +## Minor improvements and fixes + +- Fix C++20 compatibility issue on macOS (#49221). +- Turn off GCS support by default on macOS; see `vignette("install", package = "arrow")` for details on enabling it (#49068, #48995). + +# arrow 23.0.0 + +## New features + +- `nchar()` now supports `keepNA = FALSE` (@HyukjinKwon, #48665). +- `stringr::str_ilike()` binding for case-insensitive pattern matching (#48262). + +## Minor improvements and fixes + +- Fix slow performance reading files with a large number of columns (#48104). +- Fix segfault when calling `concat_tables()` on a `RecordBatch` (#47885). +- Writing partitioned datasets on S3 no longer requires `ListBucket` permissions (@HaochengLIU, #47599). + +## Installation + +- As of version 23.0.0, `arrow` requires C++20 to build from source. This means that you may need a newer compiler than the default on some older systems. See `vignette("install", package = "arrow")` for guidance. 
+ # arrow 22.0.0.1 ## Minor improvements and fixes diff --git a/r/README.md b/r/README.md index bb5d137dc88..d296143d276 100644 --- a/r/README.md +++ b/r/README.md @@ -1,13 +1,5 @@ # arrow - - -[![cran](https://www.r-pkg.org/badges/version-last-release/arrow)](https://cran.r-project.org/package=arrow) -[![CI](https://github.com/apache/arrow/actions/workflows/r.yml/badge.svg?branch=main&event=push)](https://github.com/apache/arrow/actions/workflows/r.yml?query=branch%3Amain+event%3Apush) -[![R-universe status badge](https://apache.r-universe.dev/badges/arrow)](https://apache.r-universe.dev) -[![conda-forge](https://img.shields.io/conda/vn/conda-forge/r-arrow.svg)](https://anaconda.org/conda-forge/r-arrow) - - ## Overview @@ -64,7 +56,7 @@ It allows users to read and write data in a variety of formats: It provides access to remote filesystems and servers: -- Read and write files in Amazon S3 and Google Cloud Storage buckets +- Read and write files in Amazon S3 and Google Cloud Storage buckets (note: CRAN builds include S3 support but not GCS, which requires an alternative installation method; see the [cloud storage article](https://arrow.apache.org/docs/r/articles/fs.html) for details) - Connect to Arrow Flight servers to transport large datasets over networks Additional features include: diff --git a/r/inst/NOTICE.txt b/r/inst/NOTICE.txt index 2089c6fb203..8046f20a0b9 100644 --- a/r/inst/NOTICE.txt +++ b/r/inst/NOTICE.txt @@ -1,5 +1,5 @@ Apache Arrow -Copyright 2016-2024 The Apache Software Foundation +Copyright 2016-2026 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). @@ -17,9 +17,6 @@ https://github.com/libdynd This product includes software from the LLVM project * distributed under the University of Illinois Open Source -This product includes software from the google-lint project - * Copyright (c) 2009 Google Inc. All rights reserved. 
- This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; diff --git a/r/man/DictionaryType.Rd b/r/man/DictionaryType.Rd index 8c9087f1ab6..cda27978b1b 100644 --- a/r/man/DictionaryType.Rd +++ b/r/man/DictionaryType.Rd @@ -3,13 +3,40 @@ \docType{class} \name{DictionaryType} \alias{DictionaryType} -\title{class DictionaryType} +\title{DictionaryType class} \description{ -class DictionaryType +\code{DictionaryType} is a \link{FixedWidthType} that represents dictionary-encoded data. +Dictionary encoding stores unique values in a dictionary and uses integer-type +indices to reference them, which can be more memory-efficient for data with many +repeated values. } -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$ToString()}: Return a string representation of the dictionary type +\item \verb{$code(namespace = FALSE)}: Return R code to create this dictionary type +} +} + +\section{Active bindings}{ -TODO +\itemize{ +\item \verb{$index_type}: The \link{DataType} for the dictionary indices (must be an integer type, +signed or unsigned) +\item \verb{$value_type}: The \link{DataType} for the dictionary values +\item \verb{$name}: The name of the type. +\item \verb{$ordered}: Whether the dictionary is ordered. +} +} + +\section{Factory}{ + + +\code{DictionaryType$create()} takes the following arguments: +\itemize{ +\item \code{index_type}: A \link{DataType} for the indices (default \code{\link[=int32]{int32()}}) +\item \code{value_type}: A \link{DataType} for the values (default \code{\link[=utf8]{utf8()}}) +\item \code{ordered}: Is this an ordered dictionary (default \code{FALSE})? 
+} } diff --git a/r/man/FixedWidthType.Rd b/r/man/FixedWidthType.Rd index ac6723d79db..71d0ab2d276 100644 --- a/r/man/FixedWidthType.Rd +++ b/r/man/FixedWidthType.Rd @@ -5,11 +5,22 @@ \alias{FixedWidthType} \title{FixedWidthType class} \description{ -FixedWidthType class +\code{FixedWidthType} is a base class for data types with a fixed width in bits. +This includes all integer types, floating-point types, \code{Boolean}, +\code{FixedSizeBinary}, temporal types (dates, times, timestamps, durations), +and decimal types. } -\section{Methods}{ +\section{R6 Methods}{ -TODO +\code{FixedWidthType} inherits from \link{DataType}, so it has the same methods. } +\section{Active bindings}{ + +\itemize{ +\item \verb{$bit_width}: The width of the type in bits +} +} + +\keyword{internal} diff --git a/r/man/Message.Rd b/r/man/Message.Rd index fbad235b64f..b8be82bfa4b 100644 --- a/r/man/Message.Rd +++ b/r/man/Message.Rd @@ -5,11 +5,24 @@ \alias{Message} \title{Message class} \description{ -Message class +\code{Message} holds an Arrow IPC message, which includes metadata and +an optional message body. } -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$Equals(other)}: Check if this \code{Message} is equal to another \code{Message} +\item \verb{$body_length()}: Return the length of the message body in bytes +\item \verb{$Verify()}: Check if the \code{Message} metadata is valid Flatbuffer format +} +} -TODO +\section{Active bindings}{ + +\itemize{ +\item \verb{$type}: The message type +\item \verb{$metadata}: The message metadata +\item \verb{$body}: The message body as a \link{Buffer} +} } diff --git a/r/man/MessageReader.Rd b/r/man/MessageReader.Rd index 32ca8900b33..4c3bef3fc9f 100644 --- a/r/man/MessageReader.Rd +++ b/r/man/MessageReader.Rd @@ -5,11 +5,22 @@ \alias{MessageReader} \title{MessageReader class} \description{ -MessageReader class +\code{MessageReader} reads \code{Message} objects from an input stream. 
} -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$ReadNextMessage()}: Read the next \code{Message} from the stream. Returns \code{NULL} if +there are no more messages. +} +} + +\section{Factory}{ -TODO + +\code{MessageReader$create()} takes the following argument: +\itemize{ +\item \code{stream}: An \link{InputStream} or object coercible to one (e.g., a raw vector) +} } diff --git a/r/man/acero.Rd b/r/man/acero.Rd index dcaca04d2f2..d99cf14fe96 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -198,7 +198,7 @@ Valid values are "s", "ms" (default), "us", "ns". \itemize{ \item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} -\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported +\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} \item \code{\link[dplyr:across]{if_all()}} diff --git a/r/tools/checksums/r-libarrow-darwin-arm64-23.0.1.zip.sha512 b/r/tools/checksums/r-libarrow-darwin-arm64-23.0.1.zip.sha512 new file mode 100644 index 00000000000..f81536f77ff --- /dev/null +++ b/r/tools/checksums/r-libarrow-darwin-arm64-23.0.1.zip.sha512 @@ -0,0 +1 @@ +3d1156bac8ed2939f3541cbfdb6da0d2a33aa1cc9d8c54f4046532fe5bb5d436ff67d2ab8f47ca6efd2169efad9de6f9451745913eca952239265c11c1e046b4 r-libarrow-darwin-arm64-23.0.1.zip diff --git a/r/tools/checksums/r-libarrow-darwin-x86_64-23.0.1.zip.sha512 b/r/tools/checksums/r-libarrow-darwin-x86_64-23.0.1.zip.sha512 new file mode 100644 index 00000000000..850d62ec018 --- /dev/null +++ b/r/tools/checksums/r-libarrow-darwin-x86_64-23.0.1.zip.sha512 @@ -0,0 +1 @@ +d4daf5ea72c0abcee74b8726c7a96450025f7a6662e4ba302e74c00c0bf1d547ce7cd97a2708f3a577396cb3127533bbbd986dfaa572696dd360ba031e4602f9 r-libarrow-darwin-x86_64-23.0.1.zip diff --git 
a/r/tools/checksums/r-libarrow-linux-x86_64-23.0.1.zip.sha512 b/r/tools/checksums/r-libarrow-linux-x86_64-23.0.1.zip.sha512 new file mode 100644 index 00000000000..36b3f32360b --- /dev/null +++ b/r/tools/checksums/r-libarrow-linux-x86_64-23.0.1.zip.sha512 @@ -0,0 +1 @@ +3b39b288d9e280d2fa4dce1bd36d46f7981004448bcb7c4d16002bb25cecc2a31bf334383159364687a66379468c8bc81982a6e087593fd3d2daf0651d7657e8 r-libarrow-linux-x86_64-23.0.1.zip diff --git a/r/tools/checksums/r-libarrow-windows-x86_64-23.0.1.zip.sha512 b/r/tools/checksums/r-libarrow-windows-x86_64-23.0.1.zip.sha512 new file mode 100644 index 00000000000..e5b56742bb2 --- /dev/null +++ b/r/tools/checksums/r-libarrow-windows-x86_64-23.0.1.zip.sha512 @@ -0,0 +1 @@ +4af0fd6f280f4271131ce700a6edb52651cb5dbcfe740ecbc47fa851bfdcc779c834a049305b33521f6144952680141118a7f387b941c2c4590c46ce86909cb1 r-libarrow-windows-x86_64-23.0.1.zip diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index f4ccb4956a8..151dd47f5dd 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -597,7 +597,7 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c( env_var_list, ARROW_S3 = Sys.getenv("ARROW_S3", "ON"), - ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), + # ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), ARROW_WITH_ZSTD = Sys.getenv("ARROW_WITH_ZSTD", "ON") ) } diff --git a/r/vignettes/developers/binary_features.Rmd b/r/vignettes/developers/binary_features.Rmd new file mode 100644 index 00000000000..ed6c7180f5b --- /dev/null +++ b/r/vignettes/developers/binary_features.Rmd @@ -0,0 +1,193 @@ +--- +title: "Libarrow binary features" +description: > + Understanding which C++ features are enabled in Arrow R package builds +output: rmarkdown::html_vignette +--- + +This document explains which C++ features are enabled in different Arrow R +package build configurations, and documents the decisions behind our default +feature set. 
This is intended as internal developer documentation for understanding +which features are enabled in which builds. It is not intended to be a guide for +installing the Arrow R package; for that, see the +[installation guide](../../install.html). + +## Overview + +When the Arrow R package is installed, it needs a copy of the Arrow C++ library +(libarrow). This can come from: + +1. **Prebuilt binaries** we host (for releases and nightlies) +2. **Source builds** when binaries aren't available or users opt out + +The features available in libarrow depend on how it was built. This document +covers the feature configuration for both scenarios. + +## Prebuilt libarrow binary configuration + +We produce prebuilt libarrow binaries for macOS, Windows, and Linux. These +binaries include **more features** than the default source build to provide +users with a fully-featured experience out of the box. + +### Current binary feature set + +| Platform | S3 | GCS | Configured in | +|----------|----|----|---------------| +| macOS (ARM64, x86_64) | ON | ON | `dev/tasks/r/github.packages.yml` | +| Windows | ON | ON | `ci/scripts/PKGBUILD` | +| Linux (x86_64) | ON | ON | `compose.yaml` (`ubuntu-cpp-static`) | + +### Exceptions to our build defaults + +Even though GCS defaults to OFF for source builds, we explicitly enable it in +our prebuilt binaries because: + +1. **Binary users expect features to "just work"** - they shouldn't need to + rebuild from source to access cloud storage +2. **Build time is not a concern** - we build binaries once in CI, not on + user machines +3. **Parity across platforms** - users get the same features regardless of OS + + +## Feature configuration in source builds of libarrow + +Source builds are controlled by `r/inst/build_arrow_static.sh`. 
The key +environment variable is `LIBARROW_MINIMAL`: + +- `LIBARROW_MINIMAL` unset: Default feature set (Parquet, Dataset, JSON, common compression ON; S3/GCS/jemalloc OFF) +- `LIBARROW_MINIMAL=false`: Full feature set (adds S3, jemalloc, additional compression) +- `LIBARROW_MINIMAL=true`: Truly minimal (disables Parquet, Dataset, JSON, most compression, SIMD) + +### Features always enabled + +These features are always built regardless of `LIBARROW_MINIMAL`: + +| Feature | CMake Flag | Notes | +|---------|------------|-------| +| Compute | `ARROW_COMPUTE=ON` | Core compute functions | +| CSV | `ARROW_CSV=ON` | CSV reading/writing | +| Filesystem | `ARROW_FILESYSTEM=ON` | Local filesystem support | +| JSON | `ARROW_JSON=ON` | JSON reading | +| Parquet | `ARROW_PARQUET=ON` | Parquet file format | +| Dataset | `ARROW_DATASET=ON` | Multi-file datasets | +| Acero | `ARROW_ACERO=ON` | Query execution engine | +| Mimalloc | `ARROW_MIMALLOC=ON` | Memory allocator | +| LZ4 | `ARROW_WITH_LZ4=ON` | LZ4 compression | +| Snappy | `ARROW_WITH_SNAPPY=ON` | Snappy compression | +| RE2 | `ARROW_WITH_RE2=ON` | Regular expressions | +| UTF8Proc | `ARROW_WITH_UTF8PROC=ON` | Unicode support | + +### Features controlled by LIBARROW_MINIMAL + +When `LIBARROW_MINIMAL=false`, the following additional features are enabled +(via `$ARROW_DEFAULT_PARAM=ON`): + +| Feature | CMake Flag | Default | +|---------|------------|---------| +| S3 | `ARROW_S3` | `$ARROW_DEFAULT_PARAM` | +| Jemalloc | `ARROW_JEMALLOC` | `$ARROW_DEFAULT_PARAM` | +| Brotli | `ARROW_WITH_BROTLI` | `$ARROW_DEFAULT_PARAM` | +| BZ2 | `ARROW_WITH_BZ2` | `$ARROW_DEFAULT_PARAM` | +| Zlib | `ARROW_WITH_ZLIB` | `$ARROW_DEFAULT_PARAM` | +| Zstd | `ARROW_WITH_ZSTD` | `$ARROW_DEFAULT_PARAM` | + +### Features that require explicit opt-in + +GCS (Google Cloud Storage) is **always off by default**, even when +`LIBARROW_MINIMAL=false`: + +| Feature | CMake Flag | Default | Reason | +|---------|------------|---------|--------| +| GCS | 
`ARROW_GCS` | `OFF` | Build complexity, dependency size | + +To enable GCS in a source build, you must explicitly set `ARROW_GCS=ON`. + +**Why is GCS off by default?** + +GCS was turned off by default in [#48343](https://github.com/apache/arrow/pull/48343) +(December 2025) because: + +1. Building google-cloud-cpp is fragile and adds significant build time +2. The dependency on abseil (ABSL) has caused compatibility issues +3. Users who need GCS can still enable it explicitly + +## Configuration file locations + +### libarrow source build configuration + +The main build script that controls source builds: + +**`r/inst/build_arrow_static.sh`** - CMake flags and defaults +([view source](https://github.com/apache/arrow/blob/main/r/inst/build_arrow_static.sh)). +The environment variables to look for are `LIBARROW_MINIMAL`, `ARROW_*`, and `ARROW_DEFAULT_PARAM`. + +### libarrow binary build configuration + +Each platform has its own configuration file: + +| Platform | Config file | Key settings | +|----------|-------------|--------------| +| macOS | `dev/tasks/r/github.packages.yml` | `LIBARROW_MINIMAL=false`, `ARROW_GCS=ON` | +| Windows | `ci/scripts/PKGBUILD` | `ARROW_GCS=ON`, `ARROW_S3=ON` | +| Linux | `compose.yaml` (`ubuntu-cpp-static`) | `LIBARROW_MINIMAL=false`, `ARROW_GCS=ON` | + +## R-universe builds + +[R-universe](https://apache.r-universe.dev/arrow) builds the Arrow R package +for users who want newer versions than CRAN. 
R-universe behavior varies by +platform and architecture: + +| Platform | Architecture | Build method | Features | +|----------|--------------|--------------|----------| +| macOS | ARM64 | Downloads prebuilt binary | Full (S3 + GCS) | +| macOS | x86_64 | Downloads prebuilt binary | Full (S3 + GCS) | +| Windows | x86_64 | Downloads prebuilt binary | Full (S3 + GCS) | +| Windows | ARM64 | Not supported | NA | +| Linux | x86_64 | Downloads prebuilt binary | Full (S3 + GCS) | +| Linux | ARM64 | Builds from source | S3 only (no GCS) | + +### Why Linux ARM64 builds from source + +We only publish prebuilt Linux binaries for x86_64 architecture. The binary +selection logic in `r/tools/nixlibs.R` (line 263) explicitly checks for this: + +```r +if (identical(os, "darwin") || (identical(os, "linux") && identical(arch, "x86_64"))) { +``` +When R-universe builds on Linux ARM64 runners, no binary is available, so it +falls back to building from source using `build_arrow_static.sh`. Since GCS +defaults to OFF in that script, Linux ARM64 users don't get GCS support. + +### Enabling GCS for Linux ARM64 + +To provide full feature parity for Linux ARM64, we would need to: + +1. Add an ARM64 Linux build job to `dev/tasks/r/github.packages.yml` +2. Update `select_binary()` in `nixlibs.R` to recognize `linux-aarch64` +3. Add the artifact pattern to `dev/tasks/tasks.yml` +4. Update the nightly upload workflow + +See [GH-36193](https://github.com/apache/arrow/issues/36193) for tracking this work. + +Alternatively, changing the GCS default in `build_arrow_static.sh` from `OFF` +to `$ARROW_DEFAULT_PARAM` would enable GCS for all source builds, including +Linux ARM64 on R-universe. 
+ +## Checking installed features + +Users can check which features are enabled in their installation: + +```r +# Show all capabilities +arrow::arrow_info() + +# Check specific features +arrow::arrow_with_s3() +arrow::arrow_with_gcs() +``` + +## Related documentation + +- [Installation guide](../install.html) - User-facing installation docs +- [Installation details](./install_details.html) - How the build system works +- [Developer setup](./setup.html) - Building Arrow for development diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index ed3b1bddb03..52652ad7e9e 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -12,9 +12,9 @@ To make this work, the Arrow C++ library contains a general-purpose interface fo This article provides an overview of working with both S3 and GCS data using the Arrow toolkit. -## S3 and GCS support on Linux +## S3 and GCS support -Before you start, make sure that your arrow install has support for S3 and/or GCS enabled. For most users this will be true by default, because the Windows and macOS binary packages hosted on CRAN include S3 and GCS support. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow installation has support for S3 and/or GCS enabled. You can check whether support is enabled via helper functions: ```r arrow_with_s3() @@ -23,7 +23,20 @@ arrow_with_gcs() If these return `TRUE` then the relevant support is enabled. -In some cases you may find that your system does not have support enabled. The most common case for this occurs on Linux when installing arrow from source. In this situation S3 and GCS support is not always enabled by default, and there are additional system requirements involved. See the [installation article](./install.html) for details on how to resolve this. +CRAN builds of arrow include S3 support but not GCS support. 
If you need GCS support, you can install arrow with full features using one of the following methods: + +```r +# Option 1: Install from R-universe +install.packages("arrow", repos = "https://apache.r-universe.dev") +``` + +```r +# Option 2: Reinstall from source with full features +Sys.setenv("NOT_CRAN" = "true") +install.packages("arrow", type = "source") +``` + +On Linux, S3 and GCS support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. ## Connecting to cloud storage diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index d9cdcc3885c..14e6622e043 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -8,6 +8,8 @@ output: rmarkdown::html_vignette In most cases, `install.packages("arrow")` should just work. There are things you can do to make the installation faster, documented in this article. If for some reason installation does not work, set the environment variable `ARROW_R_DEV=true`, retry, and share the logs with us. +Note that CRAN builds of arrow have some optional features disabled, including Google Cloud Storage (GCS) support. If you need these features, see the information below on [building with a libarrow binary](#r-source-package-with-libarrow-binary), or the [cloud storage article](./fs.html#s3-and-gcs-support) for alternative installation options. + ## Background The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or macOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. 
However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches.