Avro is a Hadoop subproject started by Doug Cutting, the creator of Hadoop (and of Lucene, Nutch, and other projects). It is a data serialization system designed for applications that exchange data in bulk. Its main characteristics are a self-describing container format, a compact binary encoding, built-in compression support, schema resolution between writer and reader, cross-language bindings, and efficient serialization. This combination of flexibility and efficiency, together with its support for schema evolution (illustrated at the end of this section), makes Avro well suited to big data and real-time processing environments and to systems whose data structures change over time.
Avro data carries its schema with it, so a file or message is self-describing: its structure can be understood from the data itself, without relying on an external schema definition. Before writing data, a schema must be defined. Avro schemas are written in JSON and can be obtained in three ways, each shown below: derived at runtime from an existing Java class via reflection, written by hand as a .avsc file and used with GenericRecord, or used to generate Java classes with the Avro tooling (here, the avro-maven-plugin). The first approach, reflection, looks like this:
import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.reflect.ReflectDatumWriter;

public class AvroDemo1 {
    public static void main(String[] args) throws IOException {
        Student student = new Student(104, "TEST_STUDENT");
        // Derive the schema from the Student class via reflection
        Schema schema = ReflectData.get().getSchema(Student.class);

        // Serialize the students to disk without compression
        File file = new File(SerdeConst.BASE_PATH + "student1.avro");
        DatumWriter<Student> studentDatumWriter = new ReflectDatumWriter<>(Student.class);
        DataFileWriter<Student> studentDataFileWriter = new DataFileWriter<>(studentDatumWriter);
        studentDataFileWriter.create(schema, file);
        for (int i = 0; i < 100000; i++)
            studentDataFileWriter.append(student);
        studentDataFileWriter.close();

        // Write the same data again with a compression codec for comparison
        File file2 = new File(SerdeConst.BASE_PATH + "student1_codec.avro");
        DatumWriter<Student> studentDatumWriter2 = new ReflectDatumWriter<>(Student.class);
        DataFileWriter<Student> studentDataFileWriter2 = new DataFileWriter<>(studentDatumWriter2);
        // Use the Deflate codec
        studentDataFileWriter2.setCodec(CodecFactory.deflateCodec(CodecFactory.DEFAULT_DEFLATE_LEVEL));
        // studentDataFileWriter2.setCodec(CodecFactory.snappyCodec()); // Snappy compression; requires the snappy dependency
        studentDataFileWriter2.create(schema, file2);
        for (int i = 0; i < 100000; i++)
            studentDataFileWriter2.append(student);
        studentDataFileWriter2.close();

        // Deserialize the students from disk
        DatumReader<Student> studentDatumReader = new ReflectDatumReader<>(Student.class);
        DataFileReader<Student> studentDataFileReader = new DataFileReader<>(file, studentDatumReader);
        Student deStudent = null;
        while (studentDataFileReader.hasNext()) {
            deStudent = studentDataFileReader.next();
        }
        studentDataFileReader.close();
        System.out.println(deStudent);
    }
}
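Because the container file embeds the schema it was written with, the student1.avro file produced above can be read back and inspected without supplying any schema at all, which is the self-describing property mentioned earlier. Below is a minimal sketch; the AvroSelfDescribingDemo class name is illustrative, and it assumes the same SerdeConst constants and the file written by AvroDemo1.

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroSelfDescribingDemo {
    public static void main(String[] args) throws IOException {
        File file = new File(SerdeConst.BASE_PATH + "student1.avro");
        // No schema is passed in: the reader resolves it from the file header
        GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
        // Print the writer schema embedded in the container file
        System.out.println(dataFileReader.getSchema().toString(true));
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
        }
        dataFileReader.close();
        System.out.println(record);
    }
}

This is also why an Avro container file can be consumed by generic tooling without any out-of-band schema.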
The second approach is to write the schema by hand as a JSON .avsc file. The student.avsc file used in the next example defines the same Student record:

{
  "type": "record",
  "name": "Student",
  "namespace": "com.serde.avro",
  "fields": [
    {
      "name": "no",
      "type": "int"
    },
    {
      "name": "name",
      "type": "string"
    }
  ]
}

With student.avsc on the classpath, records can be written and read through the generic API; no generated class is required:
import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.reflect.ReflectDatumReader;

public class AvroDemo2 {
    public static void main(String[] args) throws IOException {
        // Load the schema from student.avsc on the classpath
        InputStream stream2 = AvroDemo2.class.getClassLoader().getResourceAsStream("student.avsc");
        Schema schema = new Schema.Parser().parse(stream2);

        // Create a record through the generic API
        GenericData.Record user1 = new GenericData.Record(schema);
        user1.put("name", "TEST_STUDENT");
        user1.put("no", 106);

        // Serialize the records to disk
        File file = new File(SerdeConst.BASE_PATH + "student2.avro");
        DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
        dataFileWriter.create(schema, file);
        for (int i = 0; i < SerdeConst.LENGTH; i++)
            dataFileWriter.append(user1);
        dataFileWriter.close();

        // Read the records back from disk as GenericRecords
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
        GenericRecord user = null;
        while (dataFileReader.hasNext()) {
            user = dataFileReader.next(user);
        }
        dataFileReader.close();
        System.out.println(user.get("name").toString());
        System.out.println(user.get("no").toString());

        // The same file can also be read back into a Student object via reflection,
        // because the hand-written schema matches the Student class
        DatumReader<Student> studentDatumReader = new ReflectDatumReader<>(Student.class);
        DataFileReader<Student> studentDataFileReader = new DataFileReader<>(file, studentDatumReader);
        Student student = null;
        while (studentDataFileReader.hasNext()) {
            student = studentDataFileReader.next(student);
        }
        studentDataFileReader.close();
        System.out.println(student);
    }
}
The third approach is to define the schema first (the same student.avsc as above) and have the Avro tooling generate the Java classes automatically. Here Maven's avro-maven-plugin is used:
<plugin>
  <groupId>org.apache.avro</groupId>
  <artifactId>avro-maven-plugin</artifactId>
  <version>${avro.version}</version>
  <executions>
    <execution>
      <phase>generate-sources</phase>
      <goals>
        <goal>schema</goal>
      </goals>
      <configuration>
        <sourceDirectory>${project.basedir}/src/main/avro</sourceDirectory>
        <outputDirectory>${project.build.directory}/generated-sources/java</outputDirectory>
      </configuration>
    </execution>
  </executions>
</plugin>
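Running mvn generate-sources (or any later phase, such as compile) should produce a com.serde.avro.Student class under target/generated-sources/java, following the name and namespace declared in student.avsc. The generated class extends Avro's SpecificRecordBase and carries its own schema via getSchema(), which the following example relies on: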
import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;

public class AvroDemo3 {
    public static void main(String[] args) throws IOException {
        // Create a student using the generated class
        Student student = new Student(105, "TEST_STUDENT");
        // Target file
        File file = new File(SerdeConst.BASE_PATH + "student3.avro");

        // Create a DatumWriter and use it to build a DataFileWriter
        DatumWriter<Student> studentDatumWriter = new SpecificDatumWriter<>(Student.class);
        DataFileWriter<Student> dataFileWriter = new DataFileWriter<>(studentDatumWriter);
        // The generated class carries its own schema
        dataFileWriter.create(student.getSchema(), file);
        for (int i = 0; i < SerdeConst.LENGTH; i++)
            dataFileWriter.append(student);
        dataFileWriter.close();

        // Create a DatumReader and use it to build a DataFileReader
        DatumReader<Student> studentDatumReader = new SpecificDatumReader<>(Student.class);
        DataFileReader<Student> dataFileReader = new DataFileReader<>(file, studentDatumReader);
        Student student1 = null;
        while (dataFileReader.hasNext()) {
            student1 = dataFileReader.next(student1);
        }
        dataFileReader.close();
        System.out.println(student1);
    }
}
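Finally, the schema evolution mentioned at the start can be seen by reading an existing file with a newer reader schema. The sketch below is illustrative: the AvroEvolutionDemo class name and the added age field are my own, and it reuses the student2.avro file and SerdeConst constants from AvroDemo2. The reader schema adds an age field with a default value, so records written without that field still resolve, and age is filled in from the default.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class AvroEvolutionDemo {
    public static void main(String[] args) throws IOException {
        // Reader schema: the Student record from student.avsc plus a new "age" field
        // with a default value, so files written with the old schema still resolve
        Schema readerSchema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"Student\",\"namespace\":\"com.serde.avro\","
              + "\"fields\":[{\"name\":\"no\",\"type\":\"int\"},"
              + "{\"name\":\"name\",\"type\":\"string\"},"
              + "{\"name\":\"age\",\"type\":\"int\",\"default\":0}]}");
        // Only the reader schema is supplied; the writer schema comes from the file header
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(readerSchema);
        File file = new File(SerdeConst.BASE_PATH + "student2.avro");
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader);
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
        }
        dataFileReader.close();
        // "age" was never written, so it comes back as the declared default (0)
        System.out.println(record);
    }
}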