
...

job.name=GobblinHdfsQuickStart
job.group=GobblinHdfs
job.description=Gobblin quick start job for Hdfs
job.lock.enabled=false

fs.uri=hdfs://localhost:9000

source.class=org.apache.gobblin.source.extractor.hadoop.AvroFileSource
extract.namespace=org.apache.gobblin.source.extractor.hadoop
extract.table.type=SNAPSHOT_ONLY

source.filebased.data.directory=/data/wikipedia/Wikipedia_Sandbox
source.filebased.fs.uri=hdfs://localhost:9000

writer.builder.class=org.apache.gobblin.kafka.writer.KafkaDataWriterBuilder
writer.kafka.topic=WikipediaExample
writer.kafka.producerConfig.bootstrap.servers=localhost:9092

# Confluent Schema Registry and serializers
#writer.kafka.producerConfig.value.serializer=io.confluent.kafka.serializers.KafkaAvroSerializer
#writer.kafka.producerConfig.key.serializer=io.confluent.kafka.serializers.KafkaAvroSerializer
#writer.kafka.producerConfig.schema.registry.url=http://localhost:8081

# Use Local Schema Registry and serializers
writer.kafka.producerConfig.value.serializer=org.apache.gobblin.kafka.serialize.LiAvroSerializer
writer.kafka.producerConfig.kafka.schemaRegistry.class=org.apache.gobblin.kafka.schemareg.ConfigDrivenMd5SchemaRegistry
writer.kafka.producerConfig.schemaRegistry.schema.name=WikipediaExample
writer.kafka.producerConfig.schemaRegistry.schema.value={"namespace": "example.wikipedia.avro","type": "record","name": "WikipediaArticle","fields": [{"name": "pageid", "type": ["double", "null"]},{"name": "title", "type": ["string", "null"]},{"name": "user", "type": ["string", "null"]},{"name": "anon", "type": ["string", "null"]},{"name": "userid", "type": ["double", "null"]},{"name": "timestamp", "type": ["string", "null"]},{"name": "size", "type": ["double", "null"]},{"name": "contentformat", "type": ["string", "null"]},{"name": "contentmodel", "type": ["string", "null"]},{"name": "content", "type": ["string", "null"]}]}

data.publisher.type=org.apache.gobblin.publisher.NoopPublisher

metrics.reporting.file.enabled=true
metrics.log.dir=/tmp/suvasude/metrics
metrics.reporting.file.suffix=txt
task.data.root.dir=/tmp
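
When the job finishes, the records sit in the WikipediaExample topic as LiAvro-serialized Avro payloads. The following is a minimal verification sketch, not part of the quick start itself: it reads raw bytes from the topic and prints payload sizes, since fully decoding the records would require wiring up the same schema registry the writer used. The broker address and topic name come from the config above; the class name and consumer group id are illustrative.

    import java.time.Duration;
    import java.util.Collections;
    import java.util.Properties;

    import org.apache.kafka.clients.consumer.ConsumerRecord;
    import org.apache.kafka.clients.consumer.ConsumerRecords;
    import org.apache.kafka.clients.consumer.KafkaConsumer;

    public class WikipediaTopicCheck {
        public static void main(String[] args) {
            Properties props = new Properties();
            // Broker address from the job config above.
            props.put("bootstrap.servers", "localhost:9092");
            props.put("group.id", "wikipedia-topic-check"); // illustrative group id
            props.put("auto.offset.reset", "earliest");
            // Read raw bytes: the writer serialized values with LiAvroSerializer,
            // so this only confirms that payloads arrived rather than decoding them.
            props.put("key.deserializer",
                    "org.apache.kafka.common.serialization.ByteArrayDeserializer");
            props.put("value.deserializer",
                    "org.apache.kafka.common.serialization.ByteArrayDeserializer");

            try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
                consumer.subscribe(Collections.singletonList("WikipediaExample"));
                ConsumerRecords<byte[], byte[]> records = consumer.poll(Duration.ofSeconds(10));
                for (ConsumerRecord<byte[], byte[]> record : records) {
                    System.out.printf("offset=%d, value=%d bytes%n",
                            record.offset(), record.value().length);
                }
            }
        }
    }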

  

MapReduce

  • Set up a single-node Kafka broker, as in the standalone mode.
  • Set up a single-node Hadoop cluster in pseudo-distributed mode as explained here, following the instructions to set up a YARN cluster.
  • Create a job config with the following properties (a sketch for staging sample input follows the config):

    job.name=GobblinHdfsMRQuickStart
    job.group=GobblinHdfsMR
    job.description=Gobblin quick start job for Hdfs
    job.lock.enabled=false

    launcher.type=MAPREDUCE

    fs.uri=hdfs://localhost:9000
    source.class=org.apache.gobblin.example.hadoop.HadoopTextFileSource
    extract.namespace=org.apache.gobblin.example.hadoop
    extract.table.name=test
    extract.table.type=APPEND_ONLY

    writer.fs.uri=hdfs://localhost:9000
    state.store.fs.uri=hdfs://localhost:9000

    source.hadoop.file.input.format.class=org.apache.hadoop.mapreduce.lib.input.TextInputFormat
    source.hadoop.file.splits.desired=1
    source.hadoop.file.input.paths=hdfs://localhost:9000/data/test

    converter.classes=org.apache.gobblin.converter.string.ObjectToStringConverter

    writer.builder.class=org.apache.gobblin.kafka.writer.KafkaDataWriterBuilder
    writer.kafka.topic=MRTest
    writer.kafka.producerConfig.bootstrap.servers=localhost:9092
    writer.kafka.producerConfig.value.serializer=org.apache.kafka.common.serialization.StringSerializer

    data.publisher.type=org.apache.gobblin.publisher.NoopPublisher

    mr.job.max.mappers=1

    metrics.reporting.file.enabled=true
    metrics.log.dir=/tmp/suvasude/metrics
    metrics.reporting.file.suffix=txt

    mr.job.root.dir=/gobblin-kafka/working
    state.store.dir=/gobblin-kafka/state-store
    task.data.root.dir=/jobs/kafkaetl/gobblin/gobblin-kafka/task-data
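
The job expects its input under hdfs://localhost:9000/data/test. Below is a minimal sketch for staging a sample text file there with the Hadoop FileSystem API; the class name, file name, and sample content are illustrative. Since TextInputFormat splits the input by line and ObjectToStringConverter turns each record into a string, each line of the file should come out as one string record on the MRTest topic, which can then be checked with any plain string consumer (a StringDeserializer variant of the verification sketch in the standalone section).

    import java.io.OutputStream;
    import java.nio.charset.StandardCharsets;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class StageTestInput {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // HDFS NameNode from the job config above.
            conf.set("fs.defaultFS", "hdfs://localhost:9000");

            try (FileSystem fs = FileSystem.get(conf);
                 OutputStream out = fs.create(new Path("/data/test/sample.txt"))) {
                // Each line should surface as one string record on the MRTest
                // topic once the MR job runs.
                out.write("hello from gobblin\nsecond line\n"
                        .getBytes(StandardCharsets.UTF_8));
            }
        }
    }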