VectorSerializer.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.inference;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.util.Base64;
/**
* Serializes and deserializes float vectors as base64-encoded big-endian
* float32 byte arrays.
*
* <p>Big-endian matches the format expected by Elasticsearch's
* {@code dense_vector} field type, which accepts either a JSON float array
* or a base64-encoded binary string in big-endian float32 order.
* See the Elasticsearch dense_vector mapping documentation for details.
*/
public final class VectorSerializer {

    private VectorSerializer() {
        // Static utility class; not meant to be instantiated.
    }

    /**
     * Encode a float array as a base64 string (big-endian float32).
     *
     * <p>Each element occupies {@link Float#BYTES} bytes in the encoded
     * payload, in array order. An empty array encodes to the empty string.
     *
     * @param vector the values to encode; must not be {@code null}
     * @return the base64 representation of the big-endian float32 bytes
     * @throws NullPointerException if {@code vector} is {@code null}
     * @throws IllegalArgumentException if the vector has so many elements that
     *         its byte size does not fit in an {@code int}
     */
    public static String encode(float[] vector) {
        // Guard the multiplication below: without this check the int product
        // overflows and fails later with a misleading buffer exception.
        if (vector.length > Integer.MAX_VALUE / Float.BYTES) {
            throw new IllegalArgumentException(
                    "Vector too large to encode: " + vector.length + " floats");
        }
        // A newly allocated ByteBuffer is big-endian, and the float view
        // inherits that order, so no explicit order() call is needed.
        ByteBuffer buf = ByteBuffer.allocate(vector.length * Float.BYTES);
        buf.asFloatBuffer().put(vector);
        return Base64.getEncoder().encodeToString(buf.array());
    }

    /**
     * Decode a base64 string back to a float array (big-endian float32).
     *
     * <p>The decoded payload must consist of whole float32 values, i.e. its
     * length must be a multiple of {@link Float#BYTES}. The empty string
     * decodes to an empty array.
     *
     * @param base64 the base64 text to decode; must not be {@code null}
     * @return the decoded values, one per {@link Float#BYTES} payload bytes
     * @throws NullPointerException if {@code base64} is {@code null}
     * @throws IllegalArgumentException if {@code base64} is not valid base64,
     *         or if the decoded byte count is not a multiple of
     *         {@link Float#BYTES}
     */
    public static float[] decode(String base64) {
        byte[] bytes = Base64.getDecoder().decode(base64);
        // asFloatBuffer() would silently ignore a trailing partial float;
        // reject such input so truncated or corrupt payloads are noticed.
        if (bytes.length % Float.BYTES != 0) {
            throw new IllegalArgumentException(
                    "Decoded length " + bytes.length
                            + " is not a multiple of " + Float.BYTES + " bytes");
        }
        // A wrapped ByteBuffer is big-endian by default, matching encode().
        FloatBuffer fb = ByteBuffer.wrap(bytes).asFloatBuffer();
        float[] vector = new float[fb.remaining()];
        fb.get(vector);
        return vector;
    }
}