StorageResourceId.java
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.gs;
import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument;
import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull;
import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty;
import static org.apache.hadoop.fs.gs.Constants.SCHEME;
import com.google.cloud.storage.BlobId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Identifier for a Google Cloud Storage resource. All fields are final and set once at
 * construction. An instance describes exactly one of:
 *
 * <ul>
 *   <li>the GCS root (gs://): bucketName and objectName are both null;</li>
 *   <li>a GCS Bucket: bucketName is non-null and objectName is null;</li>
 *   <li>a GCS StorageObject: bucketName and objectName are both non-null.</li>
 * </ul>
 */
class StorageResourceId {
  private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class);

  // Sentinel generationId meaning "unknown". A method that expects generationId constraints
  // may, when handed this value, perform extra low-level GETs to determine an existing
  // generationId if idempotency constraints require doing so.
  static final long UNKNOWN_GENERATION_ID = -1L;

  // Splits a GCS path into bucket and object names.
  // For 'gs://foo-bucket/foo/bar/baz', matcher.group(x) yields:
  //   0 = gs://foo-bucket/foo/bar/baz
  //   1 = foo-bucket/foo/bar/baz
  //   2 = foo-bucket
  //   3 = /foo/bar/baz
  //   4 = foo/bar/baz
  // Groups 2 (bucket) and 4 (object) are the ones used to build an instance.
  private static final Pattern GCS_PATH_PATTERN = Pattern.compile("gs://(([^/]+)(/(.+)?)?)?");

  // Singleton identifying the GCS root (gs://); getBucketName() and getObjectName() both
  // return null for it.
  static final StorageResourceId ROOT = new StorageResourceId();

  // Bucket name to be used with the Google Cloud Storage API; null only for the root.
  private final String bucketName;

  // Object name to be used with the Google Cloud Storage API; null for the root and for buckets.
  private final String objectName;

  // Human-readable form returned by toString(); computed once at construction for efficiency.
  private final String stringPath;

  // Generation used in precondition checks when this id identifies the target of a mutation
  // request.
  private final long generationId;

  /**
   * Builds the id of the GCS root (gs://). Private: external code is expected to use the
   * {@link #ROOT} singleton instead.
   */
  private StorageResourceId() {
    this.generationId = UNKNOWN_GENERATION_ID;
    this.bucketName = null;
    this.objectName = null;
    this.stringPath = StringPaths.fromComponents(this.bucketName, this.objectName);
  }

  /**
   * Builds the id of a Bucket with an unknown generation; {@code getObjectName()} returns null
   * for such an id.
   *
   * @param bucketName The bucket name of the resource. Must be non-empty and non-null.
   */
  StorageResourceId(String bucketName) {
    this(bucketName, UNKNOWN_GENERATION_ID);
  }

  /**
   * Builds the id of a full StorageObject (bucket plus object name) with an unknown generation.
   *
   * @param bucketName The bucket name of the resource. Must be non-empty and non-null.
   * @param objectName The object name of the resource. Must be non-empty and non-null.
   */
  StorageResourceId(String bucketName, String objectName) {
    this(bucketName, objectName, UNKNOWN_GENERATION_ID);
  }

  /**
   * Builds the id of a full StorageObject (bucket plus object name) carrying an explicit
   * generation.
   *
   * @param bucketName The bucket name of the resource. Must be non-empty and non-null.
   * @param objectName The object name of the resource. Must be non-empty and non-null.
   * @param generationId The generationId to be used with precondition checks when using this
   *     StorageResourceId as an identifier for mutation requests.
   * @throws IllegalArgumentException if bucketName or objectName is null or empty.
   */
  StorageResourceId(String bucketName, String objectName, long generationId) {
    // The bucket is validated before the object so the first offending argument is reported.
    checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty");
    checkArgument(!isNullOrEmpty(objectName), "objectName must not be null or empty");
    this.generationId = generationId;
    this.bucketName = bucketName;
    this.objectName = objectName;
    this.stringPath = StringPaths.fromComponents(bucketName, objectName);
  }

  /**
   * Builds the id of a Bucket carrying an explicit generation; {@code getObjectName()} returns
   * null for such an id.
   *
   * @param bucketName The bucket name of the resource. Must be non-empty and non-null.
   * @param generationId The generationId to be used with precondition checks when using this
   *     StorageResourceId as an identifier for mutation requests.
   * @throws IllegalArgumentException if bucketName is null or empty.
   */
  StorageResourceId(String bucketName, long generationId) {
    checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty");
    this.generationId = generationId;
    this.bucketName = bucketName;
    this.objectName = null;
    this.stringPath = StringPaths.fromComponents(bucketName, this.objectName);
  }

  /**
   * Tells whether this id denotes a GCS StorageObject, i.e. both {@code getBucketName} and
   * {@code getObjectName} are non-null.
   */
  boolean isStorageObject() {
    return objectName != null && bucketName != null;
  }

  /**
   * Tells whether this id denotes a GCS Bucket, i.e. a bucket name is present and
   * {@code getObjectName} returns null.
   */
  boolean isBucket() {
    return objectName == null && bucketName != null;
  }

  /**
   * Tells whether this id denotes the GCS root (gs://), i.e. both {@code getBucketName} and
   * {@code getObjectName} return null.
   */
  boolean isRoot() {
    return objectName == null && bucketName == null;
  }

  /**
   * Tells whether this id corresponds to a 'directory'. Similar to {@link FileInfo#isDirectory},
   * but decided purely from the path name: the root and every bucket count as directories, and
   * an object counts when {@code StringPaths.isDirectoryPath} accepts its name. No existence
   * check is made.
   */
  boolean isDirectory() {
    if (isRoot() || isBucket()) {
      return true;
    }
    return StringPaths.isDirectoryPath(objectName);
  }

  /**
   * Returns the bucket name component of this identifier, or null for the root.
   */
  String getBucketName() {
    return bucketName;
  }

  /**
   * Returns the object name component of this identifier, or null for the root and for buckets.
   */
  String getObjectName() {
    return objectName;
  }

  /**
   * Returns the generationId to be used with precondition checks when this id identifies the
   * target of a mutation request. The generationId takes no part in equals() or hashCode().
   */
  long getGenerationId() {
    return generationId;
  }

  /**
   * Tells whether a concrete generation is attached, i.e. generationId differs from
   * {@link #UNKNOWN_GENERATION_ID}.
   */
  boolean hasGenerationId() {
    return UNKNOWN_GENERATION_ID != generationId;
  }

  /**
   * Returns the string path computed at construction time from the bucket and object names,
   * of the form {@code gs://<bucketName>/<objectName>}.
   */
  @Override
  public String toString() {
    return stringPath;
  }

  /**
   * Two ids are equal when their bucket names and object names match; the generationId is
   * deliberately ignored.
   */
  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof StorageResourceId)) {
      return false;
    }
    StorageResourceId that = (StorageResourceId) obj;
    boolean sameBucket = Objects.equals(bucketName, that.bucketName);
    return sameBucket && Objects.equals(objectName, that.objectName);
  }

  /**
   * Hashes the cached string path, which is built from bucketName and objectName only, so the
   * generationId does not participate here either.
   */
  @Override
  public int hashCode() {
    return stringPath.hashCode();
  }

  /**
   * Returns an id whose object name looks like a directory path. The root, buckets, and objects
   * whose name already is a directory path are returned unchanged (same instance).
   *
   * <p>Note that a newly created directory id carries {@link #UNKNOWN_GENERATION_ID}; the
   * generation of this id is not copied over.
   *
   * @return A resourceId with a directory path corresponding to this resourceId.
   */
  StorageResourceId toDirectoryId() {
    boolean needsConversion = isStorageObject() && !StringPaths.isDirectoryPath(getObjectName());
    if (!needsConversion) {
      return this;
    }
    String directoryName = StringPaths.toDirectoryPath(getObjectName());
    return new StorageResourceId(getBucketName(), directoryName);
  }

  /**
   * Parses a {@link StorageResourceId} from the given string, with an unknown generation.
   *
   * @param path A string of the form {@code gs://[bucket[/object]]}. Must be non-null.
   */
  static StorageResourceId fromStringPath(String path) {
    return fromStringPath(path, UNKNOWN_GENERATION_ID);
  }

  /**
   * Parses a {@link StorageResourceId} from the given string and generationId.
   *
   * @param path A string of the form {@code gs://[bucket[/object]]}. Must be non-null.
   * @param generationId The generation to attach. Only object paths may carry a value other
   *     than {@link #UNKNOWN_GENERATION_ID}.
   * @return {@link #ROOT}, a bucket id, or an object id, depending on which components the
   *     path contains.
   * @throws IllegalArgumentException if path is null, does not match the expected form, or a
   *     known generationId is supplied for the root or for a bucket.
   */
  static StorageResourceId fromStringPath(String path, long generationId) {
    checkArgument(path != null, "path must not be null");
    Matcher parsed = GCS_PATH_PATTERN.matcher(path);
    checkArgument(parsed.matches(), "'%s' is not a valid GCS object name.", path);

    String bucket = parsed.group(2);
    String object = parsed.group(4);

    // Object path: the only case in which a caller-supplied generation is accepted.
    if (bucket != null && object != null) {
      return new StorageResourceId(bucket, object, generationId);
    }

    if (bucket == null) {
      checkArgument(generationId == UNKNOWN_GENERATION_ID,
          "Cannot specify generationId '%s' for root object '%s'", generationId, path);
      return ROOT;
    }

    checkArgument(generationId == UNKNOWN_GENERATION_ID,
        "Cannot specify generationId '%s' for bucket '%s'", generationId, path);
    return new StorageResourceId(bucket);
  }

  /**
   * Validates the given URI and, if valid, returns the associated StorageResourceId with an
   * unknown generation.
   *
   * @param path The GCS URI to validate.
   * @param allowEmptyObjectName If true, a missing object name is not considered invalid.
   * @return a StorageResourceId that may be the GCS root, a Bucket, or a StorageObject.
   */
  static StorageResourceId fromUriPath(URI path, boolean allowEmptyObjectName) {
    return fromUriPath(path, allowEmptyObjectName, UNKNOWN_GENERATION_ID);
  }

  /**
   * Validates the given URI and, if valid, returns the associated StorageResourceId.
   *
   * @param path The GCS URI to validate. Must be non-null and use the GCS scheme.
   * @param allowEmptyObjectName If true, a missing object name is not considered invalid.
   * @param generationId The generationId to be used with precondition checks when using the
   *     returned StorageResourceId as an identifier for mutation requests. It is ignored when
   *     the URI is the GCS root, for which the {@link #ROOT} singleton is returned.
   * @return a StorageResourceId that may be the GCS root, a Bucket, or a StorageObject.
   * @throws NullPointerException if path is null.
   * @throws IllegalArgumentException if the URI scheme is not the GCS scheme.
   */
  static StorageResourceId fromUriPath(URI path, boolean allowEmptyObjectName,
      long generationId) {
    LOG.trace("fromUriPath('{}', {})", path, allowEmptyObjectName);
    checkNotNull(path);

    String scheme = path.getScheme();
    if (!SCHEME.equals(scheme)) {
      String message = String.format(
          "GCS path supports only '%s' scheme, instead got '%s' from '%s'.", SCHEME, scheme, path);
      throw new IllegalArgumentException(message);
    }

    if (path.equals(GoogleCloudStorageFileSystem.GCSROOT)) {
      return ROOT;
    }

    String bucket = StringPaths.validateBucketName(path.getAuthority());

    // getPath() is used here rather than getRawPath(): the path is assumed to have been
    // properly encoded already, so the decoded form is taken as the object name.
    String object = StringPaths.validateObjectName(path.getPath(), allowEmptyObjectName);

    if (isNullOrEmpty(object)) {
      return new StorageResourceId(bucket, generationId);
    }
    return new StorageResourceId(bucket, object, generationId);
  }

  /**
   * Converts this id to a {@link BlobId} built from the bucket and object names; the
   * generationId is not passed along.
   */
  BlobId toBlobId() {
    return BlobId.of(bucketName, objectName);
  }
}