# Training script for Bigtop workshop session

# 1. Pull the container
docker pull c0sin/bigtop-ignite:h26-i12-s13

# 2. Run the container
docker run -t -i -d -h 'ignite.docker' c0sin/bigtop-ignite:h26-i12-s13 /bin/bash
# The container will run in detached (daemon) mode. Run
docker ps --all
# to find the correct container ID (the one with the same image name as above)
# and attach to it
docker attach XXX
# where XXX is the ID you found above
. /etc/profile.d/bigtop.sh
cd /bigtop

# 3. Make sure we are on branch-1.0
git branch

# 4. Repo setup
# Skip to step 5 if the repo is already added to the container image.
# Set up the repo for the Bigtop 1.0 release
cd /etc/apt/sources.list.d
wget https://www.apache.org/dist/bigtop/bigtop-1.0.0/repos/trusty/bigtop.list
# If the signing key isn't set on your system, run the following
sudo apt-get install debian-keyring
gpg --recv-key 3A367EA0FA08B173
gpg --armor --export 3A367EA0FA08B173 | sudo apt-key add -
# End of repo setup

# 5. Build custom packages for Ignite
# We'll be using Hadoop 2.6 and Spark 1.3 from the Bigtop 1.0 release and
# build our own packages for Ignite 1.4 from the master branch.
# You can skip the "vi" step, as your image should already have the change
vi bigtop.mk
# change IGNITE_HADOOP_BASE_VERSION from 1.2.0 to 1.4; set
# IGNITE_HADOOP_TARBALL_SRC=ignite-$(IGNITE_HADOOP_BASE_VERSION).tar.gz
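# For reference, a sketch of how the two edited variables should read
# afterwards (assumption: the exact surrounding lines in bigtop.mk may differ
# between checkouts):
#   IGNITE_HADOOP_BASE_VERSION=1.4
#   IGNITE_HADOOP_TARBALL_SRC=ignite-$(IGNITE_HADOOP_BASE_VERSION).tar.gz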
./gradlew ignite-hadoop-apt

# 6. Let's do the deployment
# Make sure everything Puppet needs is set
./gradlew toolchain-puppetmodules

# 6.1 Add the locally created repo with the ignite-hadoop packages in it
cat > /etc/apt/sources.list.d/bigtop-local.list << __EOF
# Bigtop local repo
deb file:///bigtop/output/apt bigtop contrib
__EOF

# 6.2 Update the packages
apt-get update

# 6.3 Prepare the deployment files (already set in the docker image:
# /etc/puppet/hieradata/site.yaml) and deploy
sudo puppet apply -d --confdir=bigtop-deploy/puppet \
  --modulepath="bigtop-deploy/puppet/modules:/etc/puppet/modules" \
  bigtop-deploy/puppet/manifests/site.pp

# 7. Benchmarking

# 7.0 Preparations
# We need a special configuration to allow IgniteRDD to work with the Spark cluster
cp /bigtop/igfs-config.xml /etc/ignite-hadoop/conf/default-config.xml
service ignite-hadoop restart
export MR_JAR=/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar

# 7.1 In-memory MR
# Run traditional MR vs in-memory MR. Pointing HADOOP_CONF_DIR at the Ignite
# client configuration makes the same job run on the in-memory MR engine.
time hadoop jar $MR_JAR pi 20 20
time HADOOP_CONF_DIR=/etc/hadoop/ignite.client.conf hadoop jar $MR_JAR pi 20 20

# Run an IO-bound MR job with and without IGFS
# 7.2 Do generation
time hadoop jar $MR_JAR teragen 100000 /user/root/tera.out.t
time HADOOP_CONF_DIR=/etc/hadoop/ignite.client.conf hadoop jar $MR_JAR \
  teragen 100000 /user/root/tera.out.imc

# 7.3 Do sorting
time hadoop jar $MR_JAR terasort /user/root/tera.out.t /user/root/sort.out.t
time HADOOP_CONF_DIR=/etc/hadoop/ignite.client.conf hadoop jar $MR_JAR \
  terasort /user/root/tera.out.imc /user/root/sort.out.imc

# 7.4 Validate (optional)
time hadoop jar $MR_JAR teravalidate /user/root/sort.out.t \
  /user/root/validate.report.t
time HADOOP_CONF_DIR=/etc/hadoop/ignite.client.conf hadoop jar $MR_JAR \
  teravalidate /user/root/sort.out.imc /user/root/validate.report.imc

# 8. Sharing state between Spark jobs using Ignite Fabric
# Spark 1.3.1 (w/ Ignite 1.3+)

# 8.1 Shut down the running Ignite service
service ignite-hadoop stop
###!!! There are some permission issues that haven't been fixed in Bigtop 1.0
chmod a+w /usr/lib/ignite-hadoop/work/*
mkdir -p /tmp/ignite/work && chmod a+w /tmp/ignite/work

# 8.2 Run a server node with the spark-ignite configuration
sudo -u spark DEFAULT_CONFIG=/bigtop/spark-ignite-config.xml \
  /usr/lib/ignite-hadoop/bin/ignite.sh > /tmp/nshell.log 2>&1 &

# 8.3 Start the Spark shell
sudo -u spark spark-shell --packages org.apache.ignite:ignite-spark:1.4.0-SNAPSHOT \
  --repositories https://repository.apache.org/content/groups/snapshots \
  --master spark://ignite.docker:7077

# 8.4 Execute these commands in the shell
import org.apache.ignite.spark._
import org.apache.ignite.configuration._

val ic = new IgniteContext[Integer, Integer](sc, "/bigtop/spark-ignite-config.xml")
val sharedRDD = ic.fromCache("SharedNumbers")
sharedRDD.filter(_._2 < 10).collect()
sharedRDD.savePairs(sc.parallelize(1 to 100000, 10).map(i => (i, i*2)))
sharedRDD.filter(_._2 > 90000).count
sharedRDD.sql("select count(_val) from Integer where _val > ?", 90000).collect()

# 8.5 Restart (stop & start) the Spark shell with the command from 8.3
# Note that the commands below don't store any state; they only read the data
# from the existing Ignite cache

# 8.6 Execute these commands
import org.apache.ignite.spark._
import org.apache.ignite.configuration._

val ic = new IgniteContext[Integer, Integer](sc, "/bigtop/spark-ignite-config.xml")
val sharedRDD = ic.fromCache("SharedNumbers")
// Now it just works with the data that has been stored in the cache already
sharedRDD.filter(_._2 > 90000).count
sharedRDD.sql("select count(_val) from Integer where _val > ?", 90000).collect()

# The end
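# Appendix A (for reference): step 6.3 relies on the hiera data file
# /etc/puppet/hieradata/site.yaml that ships in the image. A minimal sketch of
# what such a file typically carries in a Bigtop 1.0 deployment, namely the
# head node name, the storage dirs, and the component list, might look like
# the block below. The key names follow bigtop-deploy conventions, but the
# exact component labels are assumptions, not copied from the image:
#
#   bigtop::hadoop_head_node: "ignite.docker"
#   hadoop::hadoop_storage_dirs:
#     - /data/1
#   hadoop_cluster_node::cluster_components:    # hypothetical labels
#     - hadoop
#     - ignite_hadoop
#     - spark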
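# Appendix B (for reference): steps 8.2 and 8.4 read Ignite settings from
# /bigtop/spark-ignite-config.xml, which also ships in the image. A minimal
# sketch of the one piece the walkthrough depends on, a cache named
# "SharedNumbers" with indexed types so the SQL query over Integer in 8.4 can
# run, is below; this is an illustration, and the real file in the image may
# carry more settings:
#
#   <beans xmlns="http://www.springframework.org/schema/beans"
#          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#          xsi:schemaLocation="http://www.springframework.org/schema/beans
#                              http://www.springframework.org/schema/beans/spring-beans.xsd">
#     <bean class="org.apache.ignite.configuration.IgniteConfiguration">
#       <property name="cacheConfiguration">
#         <bean class="org.apache.ignite.configuration.CacheConfiguration">
#           <property name="name" value="SharedNumbers"/>
#           <property name="indexedTypes">
#             <list>
#               <value>java.lang.Integer</value>
#               <value>java.lang.Integer</value>
#             </list>
#           </property>
#         </bean>
#       </property>
#     </bean>
#   </beans>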