PublicDatasetTestUtils.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.fs.s3a.test;

import org.junit.Assume;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.S3ATestConstants;
import org.apache.hadoop.fs.s3a.S3ATestUtils;

import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_BUCKET_WITH_MANY_OBJECTS;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_REQUESTER_PAYS_FILE;

/**
 * Provides S3A filesystem URIs for public data sets for specific use cases.
 * <p>
 * This allows for the contract between S3A tests and the existence of data sets
 * to be explicit and also standardizes access and configuration of
 * replacements.
 * <p>
 * Bucket specific configuration such as endpoint or requester pays should be
 * configured within "hadoop-tools/hadoop-aws/src/test/resources/core-site.xml".
 * <p>
 * Warning: methods may mutate the configuration instance passed in.
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
public final class PublicDatasetTestUtils {

  /**
   * Private constructor for utility class.
   */
  private PublicDatasetTestUtils() {}

  /**
   * Default path for an object inside a requester pays bucket: {@value}.
   */
  private static final String DEFAULT_REQUESTER_PAYS_FILE
      = "s3a://usgs-landsat/collection02/catalog.json";

  /**
   * Default bucket name for the requester pays bucket.
   * Value = {@value}.
   */
  public static final String DEFAULT_REQUESTER_PAYS_BUCKET_NAME =
      "usgs-landsat";

  /**
   * Default bucket for an S3A file system with many objects: {@value}.
   * <p>
   * We use a subdirectory to ensure we have permissions on all objects
   * contained within as well as permission to inspect the directory itself.
   */
  private static final String DEFAULT_BUCKET_WITH_MANY_OBJECTS
      = "s3a://usgs-landsat/collection02/level-1/";

  /**
   * URI of the ORC dataset: {@value}.
   * Held as a String constant so that the value can be rendered in javadocs.
   */
  private static final String ORC_DATA_URI = "s3a://osm-pds/planet/planet-latest.orc";

  /**
   * ORC dataset, as a path built from {@link #ORC_DATA_URI}.
   */
  private static final Path ORC_DATA = new Path(ORC_DATA_URI);

  /**
   * Provide a Path for some ORC data.
   * <p>
   * The path is fixed; the configuration is currently not consulted.
   *
   * @param conf Hadoop configuration (unused)
   * @return S3A FS URI
   */
  public static Path getOrcData(Configuration conf) {
    return ORC_DATA;
  }

  /**
   * Default path for the external test file: {@value}.
   * This must be: gzipped, large enough for the performance
   * tests and in a read-only bucket with anonymous access.
   */
  public static final String DEFAULT_EXTERNAL_FILE =
      "s3a://noaa-cors-pds/raw/2023/017/ohfh/OHFH017d.23_.gz";

  /**
   * Get the external test file.
   * <p>
   * This must be: gzipped, large enough for the performance
   * tests and in a read-only bucket with anonymous access.
   * <p>
   * The path is read from the option {@code S3ATestConstants.KEY_CSVTEST_FILE},
   * falling back to {@link #DEFAULT_EXTERNAL_FILE} when the option is unset.
   * @param conf configuration
   * @return a dataset which meets the requirements.
   */
  public static Path getExternalData(Configuration conf) {
    return new Path(fetchFromConfig(conf,
        S3ATestConstants.KEY_CSVTEST_FILE, DEFAULT_EXTERNAL_FILE));
  }

  /**
   * Get the anonymous dataset.
   * <p>
   * This delegates to {@link #requireDefaultExternalData(Configuration)},
   * so the same assumption about the default external file applies.
   * @param conf configuration
   * @return a dataset which supports anonymous access.
   */
  public static Path requireAnonymousDataPath(Configuration conf) {
    return requireDefaultExternalData(conf);
  }

  /**
   * Get the external test file; assume() that it is not modified (i.e. we haven't
   * switched to a new storage infrastructure where the bucket is no longer
   * read only).
   * @param conf test configuration
   * @return test file.
   */
  public static String requireDefaultExternalDataFile(Configuration conf) {
    Assume.assumeTrue("External test file is not the default",
        isUsingDefaultExternalDataFile(conf));
    // the assumption holds, so the resolved filename equals the default.
    return DEFAULT_EXTERNAL_FILE;
  }

  /**
   * Determine whether the external data file resolved through
   * {@value S3ATestConstants#KEY_CSVTEST_FILE} is the default external file,
   * {@link #DEFAULT_EXTERNAL_FILE}.
   *
   * @param conf Configuration object.
   * @return True if the default external data file is being used.
   */
  public static boolean isUsingDefaultExternalDataFile(final Configuration conf) {
    final String filename = getExternalData(conf).toUri().toString();
    return DEFAULT_EXTERNAL_FILE.equals(filename);
  }

  /**
   * Get the test external file; assume() that it is not modified (i.e. we haven't
   * switched to a new storage infrastructure where the bucket is no longer
   * read only).
   * @param conf test configuration
   * @return test file as a path.
   */
  public static Path requireDefaultExternalData(Configuration conf) {
    return new Path(requireDefaultExternalDataFile(conf));
  }

  /**
   * Provide a URI for a directory containing many objects.
   * <p>
   * Unless otherwise configured,
   * this will be {@value #DEFAULT_BUCKET_WITH_MANY_OBJECTS}.
   *
   * @param conf Hadoop configuration
   * @return S3A FS URI
   */
  public static String getBucketPrefixWithManyObjects(Configuration conf) {
    return fetchFromConfig(conf,
        KEY_BUCKET_WITH_MANY_OBJECTS, DEFAULT_BUCKET_WITH_MANY_OBJECTS);
  }

  /**
   * Provide a URI to an object within a requester pays enabled bucket.
   * <p>
   * Unless otherwise configured,
   * this will be {@value #DEFAULT_REQUESTER_PAYS_FILE}.
   *
   * @param conf Hadoop configuration
   * @return S3A FS URI
   */
  public static String getRequesterPaysObject(Configuration conf) {
    return fetchFromConfig(conf,
        KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE);
  }

  /**
   * Fetch a trimmed configuration value, require it to be non-empty.
   * <p>
   * The non-empty check is made through {@code S3ATestUtils.assume()},
   * with a message naming the key.
   * @param conf configuration file
   * @param key key
   * @param defaultValue default value.
   * @return the resolved value.
   */
  private static String fetchFromConfig(Configuration conf, String key, String defaultValue) {
    String value = conf.getTrimmed(key, defaultValue);

    S3ATestUtils.assume("Empty test property: " + key, !value.isEmpty());

    return value;
  }

}