Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.fory.benchmark;

import java.util.Arrays;
import org.apache.fory.format.annotation.ForyVersion;
import org.apache.fory.format.encoder.Encoders;
import org.apache.fory.format.encoder.RowEncoder;
import org.apache.fory.logging.Logger;
import org.apache.fory.logging.LoggerFactory;
import org.openjdk.jmh.Main;
import org.openjdk.jmh.annotations.Benchmark;

/**
 * JMH suite for the throughput and allocation behaviour of row-codec schema evolution. Run it with
 * the JMH gc profiler ({@code -prof gc}) and read {@code gc.alloc.rate.norm} (bytes per op).
 *
 * <p>Two comparisons matter:
 *
 * <ul>
 *   <li>{@code currentDecode} against {@code olderDecode}: decoding an older payload through a
 *       projection codec should allocate no more than decoding the current schema, because each
 *       projection holds its historical schema's row layout (no per-decode rebuild).
 *   <li>every {@code *NoEvolution} benchmark against its evolution-on counterpart: the
 *       steady-state cost of enabling {@code withSchemaEvolution()} when reading and writing
 *       current-version data.
 * </ul>
 */
public class SchemaEvolutionSuite {
  private static final Logger LOG = LoggerFactory.getLogger(SchemaEvolutionSuite.class);

  /** JMH arguments used when {@link #main(String[])} is started without any. */
  private static final String DEFAULT_COMMAND_LINE =
      "org.apache.fory.*SchemaEvolutionSuite.* -f 3 -wi 3 -i 3 -t 1 -w 2s -r 2s -prof gc -rf csv";

  /** First version of the bean: only the original two fields. */
  public static class PersonV1 {
    String name;
    int age;
  }

  /** Current version of the bean: {@code email} was introduced in version 2. */
  public static class PersonV2 {
    String name;
    int age;

    @ForyVersion(since = 2)
    String email;
  }

  // Codecs with evolution switched on. The V2 codecs are the ones under test; the V1 codecs exist
  // only to produce payloads whose hash sends the V2 reader down its projection path. Standard and
  // compact formats are both covered: compact is where a per-projection cached row layout
  // matters, so olderDecode vs currentDecode there is the parity check.
  private static final RowEncoder<PersonV1> EVOLVING_V1 =
      Encoders.buildBeanCodec(PersonV1.class).withSchemaEvolution().build().get();
  private static final RowEncoder<PersonV2> EVOLVING_V2 =
      Encoders.buildBeanCodec(PersonV2.class).withSchemaEvolution().build().get();
  private static final RowEncoder<PersonV1> EVOLVING_V1_COMPACT =
      Encoders.buildBeanCodec(PersonV1.class).compactEncoding().withSchemaEvolution().build().get();
  private static final RowEncoder<PersonV2> EVOLVING_V2_COMPACT =
      Encoders.buildBeanCodec(PersonV2.class).compactEncoding().withSchemaEvolution().build().get();

  // Codecs for the same current (V2) schema with evolution switched off. Setting the
  // *NoEvolution benchmarks beside their evolution-on counterparts isolates what the
  // withSchemaEvolution() flag costs on the common path (current-version reads and writes): the
  // 8-byte hash slot the evolution wire format adds, plus the hash compare on decode.
  private static final RowEncoder<PersonV2> PLAIN_V2 =
      Encoders.buildBeanCodec(PersonV2.class).build().get();
  private static final RowEncoder<PersonV2> PLAIN_V2_COMPACT =
      Encoders.buildBeanCodec(PersonV2.class).compactEncoding().build().get();

  // Fixtures: one current-version bean plus the pre-encoded payloads each decode benchmark reads.
  private static final PersonV2 CURRENT_PERSON = sampleV2();
  private static final byte[] CURRENT_PAYLOAD = EVOLVING_V2.encode(CURRENT_PERSON);
  private static final byte[] OLDER_PAYLOAD = EVOLVING_V1.encode(sampleV1());
  private static final byte[] CURRENT_COMPACT_PAYLOAD = EVOLVING_V2_COMPACT.encode(CURRENT_PERSON);
  private static final byte[] OLDER_COMPACT_PAYLOAD = EVOLVING_V1_COMPACT.encode(sampleV1());
  private static final byte[] PLAIN_PAYLOAD = PLAIN_V2.encode(CURRENT_PERSON);
  private static final byte[] PLAIN_COMPACT_PAYLOAD = PLAIN_V2_COMPACT.encode(CURRENT_PERSON);

  /** Builds the current-version sample bean with every field populated. */
  private static PersonV2 sampleV2() {
    PersonV2 sample = new PersonV2();
    sample.name = "Ada Lovelace";
    sample.age = 36;
    sample.email = "ada@example.com";
    return sample;
  }

  /** Builds the first-version sample bean, carrying the same name and age as the V2 sample. */
  private static PersonV1 sampleV1() {
    PersonV1 sample = new PersonV1();
    sample.name = "Ada Lovelace";
    sample.age = 36;
    return sample;
  }

  /** Encodes the current bean in the standard format with evolution on. */
  @Benchmark
  public Object encode() {
    return EVOLVING_V2.encode(CURRENT_PERSON);
  }

  /** Decodes a current-version payload in the standard format with evolution on. */
  @Benchmark
  public Object currentDecode() {
    return EVOLVING_V2.decode(CURRENT_PAYLOAD);
  }

  /** Decodes a V1 payload with the V2 standard-format codec. */
  @Benchmark
  public Object olderDecode() {
    return EVOLVING_V2.decode(OLDER_PAYLOAD);
  }

  /** Encodes the current bean in the compact format with evolution on. */
  @Benchmark
  public Object compactEncode() {
    return EVOLVING_V2_COMPACT.encode(CURRENT_PERSON);
  }

  /** Decodes a current-version payload in the compact format with evolution on. */
  @Benchmark
  public Object compactCurrentDecode() {
    return EVOLVING_V2_COMPACT.decode(CURRENT_COMPACT_PAYLOAD);
  }

  /** Decodes a V1 compact payload with the V2 compact codec. */
  @Benchmark
  public Object compactOlderDecode() {
    return EVOLVING_V2_COMPACT.decode(OLDER_COMPACT_PAYLOAD);
  }

  // Baselines with evolution off for the current path. Read each one next to its evolution-on
  // counterpart (encode/currentDecode and the compact variants) to see the flag's overhead.

  /** Baseline for {@link #encode()} with evolution off. */
  @Benchmark
  public Object encodeNoEvolution() {
    return PLAIN_V2.encode(CURRENT_PERSON);
  }

  /** Baseline for {@link #currentDecode()} with evolution off. */
  @Benchmark
  public Object currentDecodeNoEvolution() {
    return PLAIN_V2.decode(PLAIN_PAYLOAD);
  }

  /** Baseline for {@link #compactEncode()} with evolution off. */
  @Benchmark
  public Object compactEncodeNoEvolution() {
    return PLAIN_V2_COMPACT.encode(CURRENT_PERSON);
  }

  /** Baseline for {@link #compactCurrentDecode()} with evolution off. */
  @Benchmark
  public Object compactCurrentDecodeNoEvolution() {
    return PLAIN_V2_COMPACT.decode(PLAIN_COMPACT_PAYLOAD);
  }

  /**
   * Launches JMH. When no arguments are given, the suite's default command line is used instead.
   *
   * @param args JMH command-line arguments; may be empty
   * @throws Exception whatever the JMH launcher propagates
   */
  public static void main(String[] args) throws Exception {
    String[] jmhArgs = args.length > 0 ? args : DEFAULT_COMMAND_LINE.split(" ");
    LOG.info("command line: {}", Arrays.toString(jmhArgs));
    Main.main(jmhArgs);
  }
}
110 changes: 110 additions & 0 deletions docs/guide/java/row-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,116 @@ std::string str = bar10->get_string(0);
| Memory usage | Full object graph in memory | Only accessed fields |
| Suitable for | Small objects, full access | Large objects, selective access |

## Schema evolution

Enable `.withSchemaEvolution()` on a row, array, or map codec builder to read payloads written
by older versions of the same bean. Writing always uses the current version; reading detects
the payload's version from a strict hash at the head of the payload. Java only.

Annotate fields added after v1 with `@ForyVersion(since = N)`:

```java
@Data
public class Person {
String name;
int age;

@ForyVersion(since = 2)
String email;
}
```

A v1 payload (with `name` and `age` only) decodes to a `Person` whose `email` is `null`.
Primitive fields added later default to `0`, `0.0`, or `false`. Unannotated fields are treated
as present from the first version, so a class can adopt versioning by annotating only the fields
added after v1.

For a record, the absent component's default is passed to the canonical constructor, so a
constructor that rejects `null` for a reference component added in a later version throws when
decoding an older payload. Let the constructor tolerate the missing value, for example by
normalizing `null` to a default:

```java
public record Person(String name, @ForyVersion(since = 2) String email) {
public Person {
if (email == null) {
email = "";
}
}
}
```

Remove a field by deleting the Java member and declaring it as a method annotated with
`@ForyVersion(until = N)` on a nested history interface, which the bean references through
`@ForySchema(removedFields = ...)`. The method's return type carries any parameterized type
information from the original field.

```java
@Data
@ForySchema(removedFields = Person.History.class)
public class Person {
String name;

@ForyVersion(since = 2)
String email;

interface History {
@ForyVersion(until = 3)
int age();

@ForyVersion(until = 5)
List<String> tags();
}
}
```

The history method name matches the original live descriptor name. For field-backed beans
(Lombok `@Data`, records, or plain classes with a backing field) that is the field name
(`age`, `tags`). For interface beans, where the live member is a getter with no backing field,
it is the method name (`getAge`).

### Wire format and limitations

Producers and consumers must agree on the `withSchemaEvolution()` flag — they are not
wire-compatible otherwise. Row payloads always carry an 8-byte hash slot; under evolution its
value is the strict hash (which includes field names and nullability), so a flag-mismatched
peer fails loudly with `ClassNotCompatibleException`. Arrays and maps of bean elements prepend
an 8-byte strict-hash prefix under evolution and no prefix otherwise; an evolution-on consumer
reading evolution-off bytes also fails with `ClassNotCompatibleException`, but the reverse
direction (evolution-off consumer, evolution-on bytes) is undefined.

To adopt the flag on an existing deployment, enable `withSchemaEvolution()` on both sides in a
release that changes no schema, then start evolving schemas only once every peer is on the
evolution-enabled build. Turning the flag on and changing a schema in the same release strands
any peer that has not yet upgraded.

Cross-language consumers (Python, C++) cannot read evolution-enabled payloads.

A reader selects the matching layout from the 8-byte strict hash on the payload. The hash includes
field names and nullability and is checked for collisions across a bean's own versions when the
codec is built, but it is still a 64-bit value: a payload whose hash coincides with one of the
reader's historical layouts is decoded against that layout. This is the same hash-based dispatch
the row format has always used, so feeding a codec bytes it was not built for has undefined results
whether or not evolution is enabled. Only hand a codec payloads produced for the same bean.

Nested evolution works to arbitrary depth and places no restriction on shape: a versioned bean
may contain versioned beans that themselves contain versioned beans, the same versioned bean
class may back more than one field, and fields typed as a non-evolving bean, a list, or a map are
unrestricted. Each nesting level is routed to the correct historical layout. A versioned bean may
be used as a map key as well as a map value, and the key and value evolve independently. This
holds wherever the map appears: as the codec's top-level type, nested inside a bean field, or
reached through a top-level array or map (such as `List<Map<KeyBean, ValueBean>>`), and a single
map may evolve more than one distinct bean class across its key and value. A top-level map carries
its own hash identifying both layouts together; a map nested inside an array, another map, or a
bean field has its layouts folded into the enclosing payload's hash.

When a versioned bean contains other versioned beans, the reader can read one projection layout per
combination of versions across the composition. A reader compiles a combination's codec the first
time it decodes a payload at that combination, so the cost tracks the historical versions you
actually receive, not the number you could in principle define. A map whose key and value both
evolve combines their versions the same way. Retiring an entry from a bean's `History` interface
once you no longer read payloads from that range stops the reader from accepting those payloads; it
is purely a read-side decision, and the writer always uses the current schema.

## Related Topics

- [Xlang Serialization](xlang-serialization.md) - xlang mode
Expand Down
10 changes: 10 additions & 0 deletions docs/specification/row_format_spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,16 @@ if (fixed_width % 8 == 0):

---

## Schema Evolution (Java Only)

Schema evolution lets a codec read payloads written by older versions of the same bean. It is implemented in Java only and does not change the cross-language wire contract above; producer and consumer must agree on whether it is enabled.

The Java encoder frames a row payload with a leading 8-byte schema-hash word. When evolution is enabled, that word holds a stricter hash that also distinguishes field names and nullability; otherwise it holds the format's default schema hash. Array and map payloads carry no hash word when evolution is disabled; under evolution they gain an 8-byte strict-hash prefix. A map's prefix is a single hash that identifies the key and value layouts together, so the key and value can evolve independently while the payload still carries only one hash.

See the [Java row format guide](../guide/java/row-format.md#schema-evolution) for usage, annotations, and limitations.

---

## Common Specifications

The following specifications apply to both standard and compact formats.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,11 @@ private static boolean isSupported(TypeRef<?> typeRef, TypeResolutionContext ctx
return false;
}
Tuple2<TypeRef<?>, TypeRef<?>> mapKeyValueType = getMapKeyValueType(typeRef);
return isSupported(mapKeyValueType.f0) && isSupported(mapKeyValueType.f1);
// Thread ctx through both key and value, matching the iterable branch above. The single-arg
// isSupported overload builds a fresh context with synthesizeInterfaces=false and the empty
// custom-type registry, which would reject an interface bean used as a map key or value even
// though the same type is supported as a direct field or list element.
return isSupported(mapKeyValueType.f0, ctx) && isSupported(mapKeyValueType.f1, ctx);
} else if (cls.isEnum()) {
return true;
} else {
Expand Down
16 changes: 16 additions & 0 deletions java/fory-format/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,22 @@

<build>
<plugins>
<!--
Compile against the Java 11 API, not just to Java 11 bytecode. The parent sets only
source/target, which still resolves against the build JDK's bootclasspath, so a newer-than-11
API (e.g. the Java 16 record reflection API) compiles silently and then fails at runtime on
Java 11. release=11 makes such a leak a compile error here. Per-module: fory-core cannot use
release because it depends on sun.misc.Unsafe, which release excludes.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<release>11</release>
<source combine.self="override"/>
<target combine.self="override"/>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.fory.format.annotation;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

/**
 * Type-level schema metadata for the row codec, consulted when the codec builder enables schema
 * evolution.
 *
 * <p>A live field that carries no {@link ForyVersion} annotation counts as present since the first
 * version, so a class can adopt versioning by annotating only the fields introduced afterwards.
 *
 * <p>{@link #removedFields()} names a class (by convention a nested {@code interface}) whose
 * accessor methods stand for fields that no longer exist on this bean but still occur on the wire
 * in older payloads. The return type of each method is the removed field's original Java type,
 * and each method must be annotated with {@link ForyVersion} with {@code until} set, because a
 * removed field has a known end-of-life version.
 *
 * <p>Example:
 *
 * <pre>
 * &#64;Data
 * &#64;ForySchema(removedFields = MyBean.History.class)
 * public class MyBean {
 *   private String name;
 *
 *   interface History {
 *     &#64;ForyVersion(until = 3)
 *     List&lt;String&gt; tags();
 *
 *     &#64;ForyVersion(since = 2, until = 5)
 *     Map&lt;String, Long&gt; counters();
 *   }
 * }
 * </pre>
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ForySchema {
  /**
   * The class whose accessor methods describe fields that were present historically and have
   * since been removed. The class is never instantiated; the codec only reads its method
   * signatures and annotations.
   *
   * @return the history class, or {@code void.class} (the default) when no fields were removed
   */
  Class<?> removedFields() default void.class;
}
Loading
Loading