// DataPackage.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.core.extractor.frictionless;
import java.io.IOException;
import java.io.OutputStream;
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonPropertyOrder;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
/**
 * Represents a Frictionless Data Package manifest (datapackage.json).
 * See: https://specs.frictionlessdata.io/data-package/
 *
 * <p>The Data Package format is a simple standard for packaging data with metadata.
 * This implementation includes:
 * <ul>
 *   <li>name: identifier for the package (typically the container filename)</li>
 *   <li>created: ISO 8601 timestamp of when the package was created</li>
 *   <li>title / description: optional free-text fields, omitted from the JSON when null</li>
 *   <li>resources: list of data files with path, mediatype, bytes, and hash</li>
 * </ul>
 *
 * <p>Instances are plain mutable beans; no synchronization is performed here.
 */
@JsonInclude(JsonInclude.Include.NON_NULL)
@JsonPropertyOrder({"name", "created", "title", "description", "resources"})
public class DataPackage {

    /** Name used when the supplied package name is null or empty. */
    private static final String DEFAULT_NAME = "unknown";

    /** Matches every character that is not a lowercase letter, digit, dot, underscore or hyphen. */
    private static final java.util.regex.Pattern DISALLOWED_NAME_CHARS =
            java.util.regex.Pattern.compile("[^a-z0-9._-]");

    /** Matches a run of one or more hyphens so it can be collapsed to a single hyphen. */
    private static final java.util.regex.Pattern HYPHEN_RUN =
            java.util.regex.Pattern.compile("-+");

    /**
     * Shared mapper: pretty-prints output and is configured not to close the
     * target it writes to (see {@link #writeTo(OutputStream)}).
     */
    private static final ObjectMapper MAPPER = new ObjectMapper()
            .enable(SerializationFeature.INDENT_OUTPUT)
            .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false);

    /** Formats instants in UTC at second precision with a literal trailing {@code Z}. */
    private static final DateTimeFormatter ISO_FORMATTER =
            DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'", java.util.Locale.ROOT)
                    .withZone(ZoneOffset.UTC);

    @JsonProperty("name")
    private String name;

    @JsonProperty("created")
    private String created;

    @JsonProperty("title")
    private String title;

    @JsonProperty("description")
    private String description;

    @JsonProperty("resources")
    private List<FrictionlessResource> resources;

    /**
     * Creates a new DataPackage with the given name and current timestamp.
     *
     * @param name the package name (typically container filename); it is
     *             sanitized before being stored, and a null or empty value
     *             becomes {@code "unknown"}
     */
    public DataPackage(String name) {
        this.name = sanitizeName(name);
        this.created = ISO_FORMATTER.format(Instant.now());
        this.resources = new ArrayList<>();
    }

    /**
     * Creates an empty DataPackage for deserialization.
     * Only the resource list is initialized; name and created stay null
     * until they are set.
     */
    public DataPackage() {
        this.resources = new ArrayList<>();
    }

    /**
     * Sanitizes the name to be a valid Frictionless package name.
     * Package names should be lowercase with no spaces.
     *
     * <p>The input is lower-cased (locale independent), every character other
     * than {@code a-z}, {@code 0-9}, {@code .}, {@code _} and {@code -} is
     * replaced by a hyphen, and runs of hyphens are collapsed into one.
     *
     * @param name the raw name
     * @return sanitized name, or {@code "unknown"} when the input is null or empty
     */
    private static String sanitizeName(String name) {
        // An empty name is no more usable as an identifier than a missing one.
        if (name == null || name.isEmpty()) {
            return DEFAULT_NAME;
        }
        String lowered = name.toLowerCase(java.util.Locale.ROOT);
        String replaced = DISALLOWED_NAME_CHARS.matcher(lowered).replaceAll("-");
        return HYPHEN_RUN.matcher(replaced).replaceAll("-");
    }

    /**
     * Returns the resource list, creating it first if it is currently null
     * (for example after {@code setResources(null)}), so that adding a
     * resource never fails on a missing list.
     */
    private List<FrictionlessResource> resourcesOrCreate() {
        if (resources == null) {
            resources = new ArrayList<>();
        }
        return resources;
    }

    /**
     * @return the (already sanitized) package name, or null if never set
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the package name, applying the same sanitization as the constructor.
     *
     * @param name the raw name; null or empty becomes {@code "unknown"}
     */
    public void setName(String name) {
        this.name = sanitizeName(name);
    }

    /**
     * @return the creation timestamp string, or null if never set
     */
    public String getCreated() {
        return created;
    }

    /**
     * Sets the creation timestamp. The value is stored as given; it is not
     * parsed or validated here.
     *
     * @param created the timestamp string
     */
    public void setCreated(String created) {
        this.created = created;
    }

    /**
     * @return the optional title, or null
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the optional title; null leaves it out of the JSON
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the optional description, or null
     */
    public String getDescription() {
        return description;
    }

    /**
     * @param description the optional description; null leaves it out of the JSON
     */
    public void setDescription(String description) {
        this.description = description;
    }

    /**
     * Returns the resource list held by this package (the list itself, not a copy).
     *
     * @return the resources; may be null if null was passed to
     *         {@link #setResources(List)}
     */
    public List<FrictionlessResource> getResources() {
        return resources;
    }

    /**
     * Replaces the resource list. The given list is stored as is (no copy).
     *
     * @param resources the new list; may be null
     */
    public void setResources(List<FrictionlessResource> resources) {
        this.resources = resources;
    }

    /**
     * Adds a resource to this data package.
     *
     * @param resource the resource to add
     */
    public void addResource(FrictionlessResource resource) {
        resourcesOrCreate().add(resource);
    }

    /**
     * Adds a resource to this data package with all parameters.
     *
     * @param path relative path within package
     * @param mediatype MIME type
     * @param bytes file size
     * @param hash SHA256 hash with "sha256:" prefix
     * @param name optional original filename
     */
    public void addResource(String path, String mediatype, long bytes, String hash, String name) {
        resourcesOrCreate().add(FrictionlessResource.create(path, mediatype, bytes, hash, name));
    }

    /**
     * Serializes this DataPackage to JSON.
     *
     * @return JSON string representation (indented)
     * @throws IOException if serialization fails
     */
    public String toJson() throws IOException {
        return MAPPER.writeValueAsString(this);
    }

    /**
     * Writes this DataPackage as JSON to the given output stream.
     * Does not close the output stream.
     *
     * @param outputStream the stream to write to
     * @throws IOException if serialization fails
     */
    public void writeTo(OutputStream outputStream) throws IOException {
        MAPPER.writeValue(outputStream, this);
    }

    /**
     * Parses a DataPackage from JSON string.
     *
     * <p>NOTE(review): the shared mapper is not configured to ignore unknown
     * properties, so descriptors carrying fields beyond the ones modelled here
     * presumably fail to parse - confirm whether that strictness is intended.
     *
     * @param json the JSON string
     * @return parsed DataPackage
     * @throws IOException if parsing fails
     */
    public static DataPackage fromJson(String json) throws IOException {
        return MAPPER.readValue(json, DataPackage.class);
    }

    /**
     * Returns true if this package has any resources.
     *
     * @return true if the resources list is neither null nor empty
     */
    public boolean hasResources() {
        return resources != null && !resources.isEmpty();
    }

    /**
     * Returns the number of resources in this package.
     *
     * @return resource count; 0 when the list is null
     */
    public int resourceCount() {
        return resources != null ? resources.size() : 0;
    }

    /**
     * Short summary with the name, the creation timestamp and the number of
     * resources (not their contents).
     */
    @Override
    public String toString() {
        return "DataPackage{" +
                "name='" + name + '\'' +
                ", created='" + created + '\'' +
                ", resources=" + (resources != null ? resources.size() : 0) + " items" +
                '}';
    }
}