# Copyright 2014 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Environment variables to be used in the local bdutil as well as in setup
# scripts running on remote VMs; this file will be used as a preamble to each
# partial setup script being run on each VM.
#
# Edit values here before running bdutil.
# CONFIGBUCKET and PROJECT are required.

############### REQUIRED ENVIRONMENT VARIABLES (no defaults) ##################

# A GCS bucket used for sharing generated SSH keys and GHFS configuration.
# 'evaluate_late_variable_bindings' below also builds the staging location
# "gs://${CONFIGBUCKET}/bdutil-staging/..." from this value, so give the bare
# bucket name here without a "gs://" prefix.
CONFIGBUCKET=""

# The Google Cloud Platform text-based project-id which owns the GCE resources.
PROJECT=""

###############################################################################

###################### Cluster/Hardware Configuration #########################
# These settings describe the name, location, shape and size of your cluster,
# though these settings may also be used in deployment-configuration--for
# example, to whitelist intra-cluster SSH using the cluster prefix.

# GCE settings.
GCE_IMAGE='backports-debian-7'
GCE_MACHINE_TYPE='n1-standard-4'
GCE_ZONE='us-central1-b'
# When setting a network it's important for all nodes to be able to communicate
# with each other and for inbound SSH connections to be allowed, so that
# cluster setup and configuration can complete.
GCE_NETWORK='default'

# Prefix to be shared by all VM instance names in the cluster, as well as for
# SSH configuration between the JobTracker node and the TaskTracker nodes.
# 'evaluate_late_variable_bindings' derives the master name "${PREFIX}-nn" and
# the worker names "${PREFIX}-dn-<index>" from this value.
PREFIX='hs-ghfs'

# The number of worker nodes in the cluster. Worker names are generated with
# indices running from 0 to NUM_WORKERS - 1.
NUM_WORKERS=2

# If true, tries to attach the PDs listed in WORKER_ATTACHED_PDS and
# NAMENODE_ATTACHED_PD to their respective VMs as a non-boot volume. By default,
# the PDs will be named after the instance names with a "-pd" suffix.
# Like the other true/false switches in this group, the value is converted to
# 1/0 by 'evaluate_late_variable_bindings'.
USE_ATTACHED_PDS=false

# Only applicable if USE_ATTACHED_PDS is true; if so, this variable controls
# whether the PDs should be created explicitly during deployment. The PDs
# must not already exist.
CREATE_ATTACHED_PDS_ON_DEPLOY=true

# Only applicable if USE_ATTACHED_PDS is true; if so, this variable controls
# whether the PDs should be deleted explicitly when deleting the cluster.
DELETE_ATTACHED_PDS_ON_DELETE=true

# Only applicable during deployment if USE_ATTACHED_PDS is true and
# CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
# each non-boot PD to create for the worker nodes.
WORKER_ATTACHED_PDS_SIZE_GB=500

# Only applicable during deployment if USE_ATTACHED_PDS is true and
# CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
# the non-boot PD to create for the master node.
NAMENODE_ATTACHED_PD_SIZE_GB=500

# Comma-separated list of service-account scopes to include in the created VMs.
# The list of available scopes can be obtained with 'gcutil help addinstance'
# by looking under the description for "--service_account_scopes".
# Must at least include 'storage-full' for gsutil and the GCS connector to work.
GCE_SERVICE_ACCOUNT_SCOPES='storage-full'

# List of expanded worker-node names; generally should just be derived from
# $PREFIX and $NUM_WORKERS inside 'evaluate_late_variable_bindings', which
# overwrites indices 0 through NUM_WORKERS - 1 of this array; leave unchanged
# if in doubt.
WORKERS=()

# List of expanded per-worker-node PD names. Only applicable if USE_ATTACHED_PDS
# is true. Generated inside 'evaluate_late_variable_bindings' by default (each
# worker name with a "-pd" suffix); leave unchanged if in doubt.
WORKER_ATTACHED_PDS=()

###############################################################################

#################### Deployment/Software Configuration ########################
# These settings are used by installation and configuration scripts running
# inside the VM to customize your Hadoop installation.

# Whether or not to install and configure the Cloud Storage connector.
# Must be true if DEFAULT_FS is gs.
INSTALL_GCS_CONNECTOR=true

# Whether or not to install and configure the BigQuery connector.
INSTALL_BIGQUERY_CONNECTOR=false

# Whether or not to install and configure the Datastore connector.
INSTALL_DATASTORE_CONNECTOR=false

# Whether or not to configure and start HDFS.
# Must be true if DEFAULT_FS is hdfs.
ENABLE_HDFS=true

# Whether or not to check permissions for accessing HDFS files.
# NOTE: unlike the other true/false settings in this file, this one is not
# passed through normalize_boolean by the default
# 'evaluate_late_variable_bindings', so it keeps its literal true/false value.
ENABLE_HDFS_PERMISSIONS=false

# The default filesystem. One of [gs|hdfs].
DEFAULT_FS='gs'

# Options to be passed to TaskTracker child JVMs.
JAVAOPTS='-Xms1024m -Xmx2048m'

# Complete URL for downloading the GCS Connector JAR file.
GCS_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-1.2.7-hadoop1.jar'

# Complete URL for downloading the BigQuery Connector JAR file.
BIGQUERY_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/bigquery/bigquery-connector-0.4.2.jar'

# Complete URL for downloading the Cloud Datastore Connector JAR file.
DATASTORE_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/datastore/datastore-connector-0.14.5.jar'

# Complete URL for downloading the configuration script.
BDCONFIG='https://storage.googleapis.com/hadoop-tools/bdconfig/bdconfig-0.28.1.tar.gz'

# URI of the Hadoop tarball to be deployed. Must begin with gs:// or http(s)://
# Use 'gsutil ls gs://hadoop-dist/hadoop-*.tar.gz' to list the Google-supplied
# options.
HADOOP_TARBALL_URI='gs://hadoop-dist/hadoop-1.2.1-bin.tar.gz'

# Directory where Hadoop is to be installed.
HADOOP_INSTALL_DIR='/home/hadoop/hadoop-install'

# Directory holding config files and scripts for Hadoop.
# NOTE: this is expanded immediately when this file is sourced, not inside
# 'evaluate_late_variable_bindings'; an override file that changes
# HADOOP_INSTALL_DIR must therefore also redefine HADOOP_CONF_DIR.
HADOOP_CONF_DIR="${HADOOP_INSTALL_DIR}/conf"

# If true, strips out external apt-get mirrors from /etc/apt/sources.list
# before apt-get installing the JRE. Should only be used for
# non-critical/non-sensitive deployments, because doing so may omit security
# patches from, e.g., security.debian.org.
STRIP_EXTERNAL_MIRRORS=false

###############################################################################

############################# bdutil settings #################################
# These settings don't directly affect your cluster, but simply control the
# rate, verbosity, timeouts, etc., of bdutil itself.

# Number of seconds for gcutil to wait for commands to finish before
# declaring the attempt a failure.
GCUTIL_TIMEOUT_SECONDS=600

# Number of seconds between polling operations from gcutil waiting for
# addinstance to finish. Should be increased for larger clusters to avoid
# hitting rate quota limits.
GCUTIL_POLL_INTERVAL_SECONDS=10

# Number of seconds, not necessarily a whole number, to sleep between
# invocations of async API calls. Avoids flooding the API with too many
# concurrent calls at once during deployment.
GCUTIL_SLEEP_TIME_BETWEEN_ASYNC_CALLS_SECONDS='0.1'

# If true, tee gcutil's stdout and stderr to console in addition to logfiles,
# otherwise only send its stdout and stderr to the logfiles.
# Converted to 1/0 by 'evaluate_late_variable_bindings'.
VERBOSE_MODE=false

# During deployment, the maximum number of async subprocesses to use
# concurrently; can be increased if using a larger machine. Default value is
# suitable for running out of a dedicated n1-standard-1 VM.
MAX_CONCURRENT_ASYNC_PROCESSES=150

###############################################################################

# Rewrites a boolean-like variable in place so that it holds 1 or 0 rather
# than the strings 'true' or 'false'. Numeric flags can then be tested with
# (()) arithmetic conditions, which avoids inadvertently evaluating arbitrary
# strings as commands.
#
# Args:
#   $1: NAME of the variable to normalize (not its value).
#
# Any value other than exactly 'true' or 'false' (including an unset
# variable or an already-normalized 1/0) is left untouched.
function normalize_boolean() {
  local var_name=$1
  case "${!var_name}" in
    true)
      printf -v "${var_name}" '%s' 1
      ;;
    false)
      printf -v "${var_name}" '%s' 0
      ;;
  esac
}

# Overridable function which will be called after sourcing all provided env
# files in sequence; allows environment variables which are derived from other
# variables to reflect overrides introduced in other files. For example, by
# computing WORKERS and NAMENODE_HOSTNAME as a late binding, an override file
# needs only to redefine PREFIX in order to adopt the new WORKERS and
# NAMENODE_HOSTNAME values as well.
#
# Reads:  PREFIX, NUM_WORKERS, CONFIGBUCKET and the true/false switches listed
#         below.
# Writes: the normalized switches (1/0), WORKERS, WORKER_ATTACHED_PDS,
#         NAMENODE_HOSTNAME, NAMENODE_ATTACHED_PD, NAMENODE_URI,
#         JOB_TRACKER_URI and BDUTIL_GCS_STAGING_DIR, all as global variables.
function evaluate_late_variable_bindings() {
  # Keep the loop counter local: this file is sourced as a preamble by other
  # scripts, and a global 'i' would clobber any variable of that name there.
  local i

  # Convert 'true'/'false' switches to 1/0 so they can be used inside (()).
  normalize_boolean 'STRIP_EXTERNAL_MIRRORS'
  normalize_boolean 'ENABLE_HDFS'
  normalize_boolean 'INSTALL_GCS_CONNECTOR'
  normalize_boolean 'INSTALL_BIGQUERY_CONNECTOR'
  normalize_boolean 'INSTALL_DATASTORE_CONNECTOR'
  normalize_boolean 'USE_ATTACHED_PDS'
  normalize_boolean 'CREATE_ATTACHED_PDS_ON_DEPLOY'
  normalize_boolean 'DELETE_ATTACHED_PDS_ON_DELETE'
  normalize_boolean 'VERBOSE_MODE'

  # Generate WORKERS array based on PREFIX and NUM_WORKERS. Only indices
  # 0..NUM_WORKERS-1 are written; any pre-existing higher indices are kept.
  for ((i = 0; i < NUM_WORKERS; i++)); do
    WORKERS[${i}]="${PREFIX}-dn-${i}"
  done

  # The instance name of the VM which serves as both the namenode and
  # jobtracker.
  NAMENODE_HOSTNAME="${PREFIX}-nn"

  # Generate worker PD names based on the worker instance names.
  for ((i = 0; i < NUM_WORKERS; i++)); do
    WORKER_ATTACHED_PDS[${i}]="${WORKERS[${i}]}-pd"
  done

  # Name of the master-node PD. Only applicable if USE_ATTACHED_PDS is true.
  NAMENODE_ATTACHED_PD="${NAMENODE_HOSTNAME}-pd"

  # Fully qualified HDFS URI of namenode
  NAMENODE_URI="hdfs://${NAMENODE_HOSTNAME}:8020/"

  # Host and port of jobtracker
  JOB_TRACKER_URI="${NAMENODE_HOSTNAME}:9101"

  # GCS directory for deployment-related temporary files.
  local staging_dir_base="gs://${CONFIGBUCKET}/bdutil-staging"
  BDUTIL_GCS_STAGING_DIR="${staging_dir_base}/${NAMENODE_HOSTNAME}"
}

# Array of files, either absolute or relative to the directory where bdutil
# resides, to upload to every node before executing further commands. The files
# will all be placed in the same directory as the scripts being executed.
#
# When BDUTIL_DIR is set, every '*template.xml' file found under
# "${BDUTIL_DIR}/conf" is added. Paths are read one per line and stored quoted,
# so a BDUTIL_DIR (or file name) containing spaces or glob characters yields
# exactly one array element per file. File names containing newlines are not
# supported.
UPLOAD_FILES=()
if [[ -n "${BDUTIL_DIR:-}" ]]; then
  # Capture find's output via a plain assignment so that a failing find still
  # surfaces through the assignment's exit status.
  _bdutil_template_files="$(find "${BDUTIL_DIR}/conf" -name '*template.xml')"
  while IFS= read -r _bdutil_template_file; do
    # A here-string always supplies at least one (possibly empty) line.
    if [[ -n "${_bdutil_template_file}" ]]; then
      UPLOAD_FILES+=("${_bdutil_template_file}")
    fi
  done <<< "${_bdutil_template_files}"
  # Do not leave the scratch variables behind in the sourcing script.
  unset _bdutil_template_files _bdutil_template_file
fi

# Array of strings representing mapping from command step names to the scripts
# to be executed in those steps. The first line of each group must be the name
# and end with a colon. Following the colon must be a whitespace-separated list
# of files relative to the directory where bdutil resides. Files may also be
# absolute paths.
#
# Names (portion of each element before the first ':') must be suitable for
# use as a substring inside a filename.
COMMAND_GROUPS=(
  "deploy-ssh-setup:
     libexec/setup_namenode_ssh.sh
  "

  "deploy-core-setup:
     libexec/install_java.sh
     libexec/mount_disks.sh
     libexec/setup_hadoop_user.sh
     libexec/install_hadoop.sh
     libexec/install_bdconfig.sh
     libexec/configure_hadoop.sh
     libexec/install_and_configure_gcs_connector.sh
     libexec/install_and_configure_bigquery_connector.sh
     libexec/install_and_configure_datastore_connector.sh
     libexec/configure_hdfs.sh
     libexec/set_default_fs.sh
  "

  "deploy-ssh-data-setup:
     libexec/setup_datanode_ssh.sh
  "

  "deploy-start:
     libexec/start_hadoop.sh
  "

  # Use with 'run_command_group install_connectors' to configure a pre-existing
  # Hadoop cluster with the connectors. This group is not referenced by the
  # default COMMAND_STEPS.
  "install_connectors:
     libexec/install_bdconfig.sh
     libexec/install_and_configure_gcs_connector.sh
     libexec/install_and_configure_bigquery_connector.sh
     libexec/install_and_configure_datastore_connector.sh
     libexec/set_default_fs.sh
  "
)

# Ordered deployment steps. Each element is a comma-separated pair of
# COMMAND_GROUPS names in the form
#   <invoke-on-master>,<invoke-on-all-workers>
# Within one element the commands are invoked concurrently on all VMs through
# backgrounded ssh sessions, and every such async invocation is awaited before
# the next element is processed.
#
# A literal '*' in either position is a no-op for that side, for example when
# a command must finish on only the master node before the next step runs on
# all workers.
COMMAND_STEPS=(
  'deploy-ssh-setup,*'
  'deploy-core-setup,deploy-core-setup'
  '*,deploy-ssh-data-setup'
  'deploy-start,*'
)
