Avro:当我不知道“writer”使用的确切模式时,如何使用默认字段

在 Java Avro 中,如何将下面的 data1、data2、data3 解析为 GenericRecord?

//Schema
{
  "type": "record",
  "name": "user",
  "fields": [
    {"name": "name", "type": "string"},
    {"name": "colour", "type": "string", "default": "green"},
    {"name": "mass", "type": "int", "default": 100}
  ]
}

//data 1
{"name":"Sean"}

//data 2
{"name":"Sean", "colour":"red"}

//data 3
{"name":"Sean", "colour":"red", "mass":200}

我已经看过一些关于模式演化(schema evolution)等的讨论,也知道可以将写入方(writer)的模式和读取方(reader)的模式传递给 GenericDatumReader 和 ResolvingDecoder,但我只有一个模式。一般来说,我并不知道写入方使用的确切模式(如果有的话)。

我可以通过解析模式并删除所有带默认值的字段来“推断”出一个“基础”模式。但是,如果有多个字段带有默认值,其中某些字段在数据中可能存在、也可能不存在,因此我无法推断出与数据相符的模式。

例如

  • 如果我使用给定的模式尝试用 GenericDatumReader 来读取 data3,那么解析就成功了。
  • 如果我使用推断的模式尝试GenericDatumReader来读取data1,那么解析就成功了。
  • 如果我用推断出的模式和给定的模式构造 ResolvingDecoder,再用 GenericDatumReader 来读取 data1,那么解析就成功了。
  • 其他所有组合都无法把 data1、data3 解析成这样的 GenericRecord:既包含 JSON 字符串中的所有值,又为缺失的字段填入相应的默认值。
  • 并且似乎根本无法正确解析data2!

有人有什么建议吗?

 import java.io.IOException; import org.apache.avro.Schema; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.JsonDecoder; import org.apache.avro.io.ResolvingDecoder; public class DefaultAvroTest2 { private static String properSchama_string = "{" + " \"type\": \"record\"," + " \"name\": \"user\"," + " \"fields\": [" + " {\"name\": \"name\", \"type\": \"string\"}," + " {\"name\": \"colour\", \"type\": \"string\", \"default\": \"green\"}," + " {\"name\": \"mass\", \"type\": \"int\", \"default\": 100}" + " ]" + " }"; private static String inferred_base_schama_string = "{" + " \"type\": \"record\"," + " \"name\": \"user\"," + " \"fields\": [" + " {\"name\": \"name\", \"type\": \"string\"}" + " ]" + " }"; private static String data1 = "{\"name\":\"Sean\"}"; private static String data2 = "{\"name\":\"Sean\", \"colour\":\"red\"}"; private static String data3 = "{\"name\":\"Sean\", \"colour\":\"blue\", \"mass\":200}"; public static void main(String[] args) throws IOException { System.out.println("\nObject 1 :\n"+data1); System.out.println("\nObject 2 :\n"+data2); System.out.println("\nObject 3 :\n"+data3); Schema inferred_base_schema = new Schema.Parser().parse(inferred_base_schama_string); Schema defined_schema = new Schema.Parser().parse(properSchama_string); System.out.println("\nProper schema :\n"+defined_schema.toString(true)); System.out.println("\nA base schema that could be inferred from the proper schema :\n"+inferred_base_schema.toString(true)); JsonDecoder jsonDecoder_inferred_1 = DecoderFactory.get().jsonDecoder(inferred_base_schema, data1); JsonDecoder jsonDecoder_inferred_2 = DecoderFactory.get().jsonDecoder(inferred_base_schema, data2); JsonDecoder jsonDecoder_inferred_3 = DecoderFactory.get().jsonDecoder(inferred_base_schema, data3); //Correct GenericRecord object1_inferred = new GenericDatumReader(inferred_base_schema).read(null, 
jsonDecoder_inferred_1); //Incorrect: colour is missing GenericRecord object2_inferred = new GenericDatumReader(inferred_base_schema).read(null, jsonDecoder_inferred_2); //Incorrect: colour and mass are missing GenericRecord object3_inferred = new GenericDatumReader(inferred_base_schema).read(null, jsonDecoder_inferred_3); ResolvingDecoder resolvingDecoder1 = DecoderFactory.get().resolvingDecoder(inferred_base_schema, defined_schema, DecoderFactory.get().jsonDecoder(defined_schema, data1)); ResolvingDecoder resolvingDecoder2 = DecoderFactory.get().resolvingDecoder(inferred_base_schema, defined_schema, DecoderFactory.get().jsonDecoder(defined_schema, data2)); ResolvingDecoder resolvingDecoder3 = DecoderFactory.get().resolvingDecoder(inferred_base_schema, defined_schema, DecoderFactory.get().jsonDecoder(defined_schema, data3)); //Correct GenericRecord object1_resolved = new GenericDatumReader(defined_schema).read(null, resolvingDecoder1); //Incorrect: colour is default(green) not red GenericRecord object2_resolved = new GenericDatumReader(defined_schema).read(null, resolvingDecoder2); //Incorrect: colour is default(green) not blue, and mass is default(100) not 200 GenericRecord object3_resovled = new GenericDatumReader(defined_schema).read(null, resolvingDecoder3); JsonDecoder jsonDecoder_defined_1 = DecoderFactory.get().jsonDecoder(defined_schema, data1); JsonDecoder jsonDecoder_defined_2 = DecoderFactory.get().jsonDecoder(defined_schema, data2); JsonDecoder jsonDecoder_defined_3 = DecoderFactory.get().jsonDecoder(defined_schema, data3); //Fail: org.apache.avro.AvroTypeException: Expected string. Got END_OBJECT //GenericRecord object1_defined = new GenericDatumReader(defined_schema).read(null, jsonDecoder_defined_1); //Fail: org.apache.avro.AvroTypeException: Expected int. 
Got END_OBJECT //GenericRecord object2_defined = new GenericDatumReader(defined_schema).read(null, jsonDecoder_defined_2); //Correct GenericRecord object3_defined = new GenericDatumReader(defined_schema).read(null, jsonDecoder_defined_3); //Correct System.out.println("\nObject 1 read with inferred schema:\n"+object1_inferred); //Incorrect: colour is missing System.out.println("\nObject 2 read with inferred schema:\n"+object2_inferred); //Incorrect: colour and mass are missing System.out.println("\nObject 3 read with inferred schema:\n"+object3_inferred); //Correct System.out.println("\nObject 1 read with resolving decoder:\n"+object1_resolved); //Incorrect: colour is default(green) not red System.out.println("\nObject 2 read with resolving decoder:\n"+object2_resolved); //Incorrect: colour is default(green) not blue, and mass is default(100) not 200 System.out.println("\nObject 3 read with resolving decoder:\n"+object3_resovled); //Fail //System.out.println("\nObject 1 read with defined schema:\n"+object1_defined); //Fail //System.out.println("\nObject 2 read with defined schema:\n"+object2_defined); //Correct System.out.println("\nObject 3 read with defined schema:\n"+object3_defined); } } 

输出:

 Object 1 : {"name":"Sean"} Object 2 : {"name":"Sean", "colour":"red"} Object 3 : {"name":"Sean", "colour":"blue", "mass":200} Proper schema : { "type" : "record", "name" : "user", "fields" : [ { "name" : "name", "type" : "string" }, { "name" : "colour", "type" : "string", "default" : "green" }, { "name" : "mass", "type" : "int", "default" : 100 } ] } A base schema that could be inferred from the proper schema : { "type" : "record", "name" : "user", "fields" : [ { "name" : "name", "type" : "string" } ] } Object 1 read with inferred schema: {"name": "Sean"} Object 2 read with inferred schema: {"name": "Sean"} Object 3 read with inferred schema: {"name": "Sean"} Object 1 read with resolving decoder: {"name": "Sean", "colour": "green", "mass": 100} Object 2 read with resolving decoder: {"name": "Sean", "colour": "green", "mass": 100} Object 3 read with resolving decoder: {"name": "Sean", "colour": "green", "mass": 100} Object 3 read with defined schema: {"name": "Sean", "colour": "blue", "mass": 200}