#!/usr/bin/env bash
#
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Runs WordCount job that reads from Datastore and writes to BigQuery.
################################################################################

# Usage: ./test-mr-datastore-to-bigquery.sh [datasetId] [hadoopNameNode] [projectId] [outputDatasetId] [outputTableId] [optional: path to local jar to run]
# Capture the positional parameters. The first five identify the job's
# inputs and outputs; an optional sixth overrides the local jar path.
NUMBER_OF_ARGS=$#
DATASET_ID=$1
HADOOP_NAME_NODE=$2
PROJECT_ID=$3
OUTPUT_DATASET_ID=$4
OUTPUT_TABLE_ID=$5
# "-" (rather than ":-") substitutes the default only when $6 is unset, so a
# sixth argument that was passed explicitly is used verbatim, even if empty.
PATH_TO_JAR=${6-datastoretobigquery_wordcount.jar}

# Push local jar to Google Compute Engine instance.
# The jar path is quoted so that a path containing whitespace or glob
# characters reaches gcutil as a single argument. The push is checked because
# the remote steps copy and run the pushed jar; continuing after a failed push
# would run the job against a stale or missing file.
if ! gcutil --project="$DATASET_ID" push "$HADOOP_NAME_NODE" "$PATH_TO_JAR" datastoretobigquery_wordcount.jar; then
  echo "Failed to push '$PATH_TO_JAR' to '$HADOOP_NAME_NODE'." >&2
  exit 1
fi

# Generate sample-hadoop.sh, the script that runs the Hadoop job remotely.
#
# The heredoc delimiter is deliberately unquoted: $DATASET_ID, $PROJECT_ID,
# $OUTPUT_DATASET_ID and $OUTPUT_TABLE_ID are expanded here, while the file is
# being written, whereas every \$-escaped reference is written out literally
# and is only expanded when the generated script itself runs.
#
# In the generated script:
#   - "rm -f" is used so that a jar that has not been installed yet does not
#     produce an error; the following cp installs the fresh copy either way.
#   - COUNT_CMD is quoted when echoed and evaluated so the command string is
#     logged and parsed exactly once, without an extra round of word splitting
#     and pathname expansion before eval sees it.
# NOTE(review): the lines after "sudo su hadoop" are presumably read from the
# same stdin by the shell that su starts - confirm on the target image.
cat <<EOF >sample-hadoop.sh
# Set Hadoop environment variables
chmod 666 datastoretobigquery_wordcount.jar
sudo su hadoop
rm -f /home/hadoop/hadoop-install/datastoretobigquery_wordcount.jar
cp datastoretobigquery_wordcount.jar /home/hadoop/hadoop-install/datastoretobigquery_wordcount.jar
cd /home/hadoop/hadoop-install
. libexec/hadoop-config.sh
# Set the hadoop command and the path to the example jar
HADOOP_CMD="\${HADOOP_PREFIX}/bin/hadoop --config \$HADOOP_CONF_DIR"
# REPLACE parameters in line below.
COUNT_CMD="\$HADOOP_CMD jar /home/hadoop/hadoop-install/datastoretobigquery_wordcount.jar $DATASET_ID $PROJECT_ID $OUTPUT_DATASET_ID $OUTPUT_TABLE_ID hadoopSampleWordCountLine wordcount"
echo "\$COUNT_CMD"
eval "\$COUNT_CMD"
EOF

# Run the sample on the Google Compute Engine instance: open an SSH session
# through gcutil and supply the generated sample-hadoop.sh on standard input.
gcutil \
  --project "${DATASET_ID}" \
  ssh "${HADOOP_NAME_NODE}" \
  <sample-hadoop.sh
