Requirements
Build a Hadoop release package of any version into a container image. Note that this article does not cover packaging the source code into the release tarball itself.
Build the base image
If you need a customized base environment, build the base image as described in this step. If not, use the official Hadoop base image directly and skip this step, pulling it as shown below.
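A minimal sketch, assuming the official image is published on Docker Hub under the name apache/hadoop-runner (the same tag the build command at the end of this step produces locally):

# Pull the prebuilt base image instead of building it yourself
docker pull apache/hadoop-runner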
Reference documentation
Prepare the files
Create any directory that does not yet exist on your system.
Prepare the file Dockerfile in the /base directory:
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$PATH:/opt/hadoop/bin
ENV HADOOP_LOG_DIR=/var/log/hadoop
ENV HADOOP_CONF_DIR=/etc/hadoop
RUN apt-get update && apt-get install -y \
      sudo \
      python2 \
      wget \
      curl \
      netcat-openbsd \
      jq \
      openjdk-8-jdk \
      krb5-user \
      tar \
      gzip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
RUN wget https://bootstrap.pypa.io/pip/2.7/get-pip.py && \
    python2 get-pip.py && \
    rm -f get-pip.py
RUN pip2 install robotframework
RUN wget -O /usr/local/bin/dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 && \
    chmod +x /usr/local/bin/dumb-init
RUN mkdir -p /etc/security/keytabs && \
    chmod -R a+wr /etc/security/keytabs
RUN wget -O /opt/byteman.jar https://repo.maven.apache.org/maven2/org/jboss/byteman/byteman/4.0.4/byteman-4.0.4.jar && \
    chmod o+r /opt/byteman.jar
RUN mkdir -p /opt/profiler && \
    cd /opt/profiler && \
    curl -L https://github.com/jvm-profiling-tools/async-profiler/releases/download/v1.5/async-profiler-1.5-linux-x64.tar.gz | tar xvz
RUN groupadd --gid 1000 hadoop && \
    useradd --uid 1000 --gid 100 --home /opt/hadoop --shell /bin/bash hadoop && \
    echo "hadoop ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
    chown -R hadoop:hadoop /opt && \
    mkdir -p /etc/hadoop /var/log/hadoop && \
    chmod 1777 /etc/hadoop && \
    chmod 1777 /var/log/hadoop
ADD scripts /opt/
ADD scripts/krb5.conf /etc/
WORKDIR /opt/hadoop
VOLUME /data
USER hadoop
ENTRYPOINT ["/usr/local/bin/dumb-init", "--", "/opt/starter.sh"]
Prepare the following files in the /base/scripts directory.
File .bashrc:
PS1="\u@\h: \w> "
File envtoconf.py:
"""convert environment variables to config"""
import os
import re
import argparse
import sys
import transformation
class Simple(object):
"""Simple conversion"""
def __init__(self, args):
parser = argparse.ArgumentParser()
parser.add_argument("--destination", help="Destination directory", required=True)
self.args = parser.parse_args(args=args)
# copy the default files to file.raw in destination directory
self.known_formats = ['xml', 'properties', 'yaml', 'yml', 'env', "sh", "cfg", 'conf']
self.output_dir = self.args.destination
self.excluded_envs = ['HADOOP_CONF_DIR']
self.configurables = {}
def destination_file_path(self, name, extension):
"""destination file path"""
return os.path.join(self.output_dir, "{}.{}".format(name, extension))
def write_env_var(self, name, extension, key, value):
"""Write environment variables"""
with open(self.destination_file_path(name, extension) + ".raw", "a") as myfile:
myfile.write("{}: {}\n".format(key, value))
def process_envs(self):
"""Process environment variables"""
for key in os.environ.keys():
if key in self.excluded_envs:
continue
pattern = re.compile("[_\\.]")
parts = pattern.split(key)
extension = None
name = parts[0].lower()
if len(parts) > 1:
extension = parts[1].lower()
config_key = key[len(name) + len(extension) + 2:].strip()
if extension and "!" in extension:
splitted = extension.split("!")
extension = splitted[0]
fmt = splitted[1]
config_key = key[len(name) + len(extension) + len(fmt) + 3:].strip()
else:
fmt = extension
if extension and extension in self.known_formats:
if name not in self.configurables.keys():
with open(self.destination_file_path(name, extension) + ".raw", "w") as myfile:
myfile.write("")
self.configurables[name] = (extension, fmt)
self.write_env_var(name, extension, config_key, os.environ[key])
else:
for configurable_name in self.configurables:
if key.lower().startswith(configurable_name.lower()):
self.write_env_var(configurable_name,
self.configurables[configurable_name],
key[len(configurable_name) + 1:],
os.environ[key])
def transform(self):
"""transform"""
for configurable_name in self.configurables:
name = configurable_name
extension, fmt = self.configurables[name]
destination_path = self.destination_file_path(name, extension)
with open(destination_path + ".raw", "r") as myfile:
content = myfile.read()
transformer_func = getattr(transformation, "to_" + fmt)
content = transformer_func(content)
with open(destination_path, "w") as myfile:
myfile.write(content)
def main(self):
"""main"""
# add the
self.process_envs()
# copy file.ext.raw to file.ext in the destination directory, and
# transform to the right format (eg. key: value ===> XML)
self.transform()
def main():
"""main"""
Simple(sys.argv[1:]).main()
if __name__ == '__main__':
Simple(sys.argv[1:]).main()
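The variable-name convention drives everything: the leading FILE.EXT prefix (split on "." and "_") selects the target file and its format, and the remainder becomes the configuration key. A minimal sketch of exercising the script directly, assuming python2 and transformation.py are available in the current directory (the property names and values are illustrative):

mkdir -p /tmp/conf
env 'CORE-SITE.XML_fs.defaultFS=hdfs://namenode:9000' \
    'HDFS-SITE.XML_dfs.replication=1' \
    python2 envtoconf.py --destination /tmp/conf
# /tmp/conf/core-site.xml now contains the property rendered by to_xml
cat /tmp/conf/core-site.xml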
File krb5.conf:
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[logging]
  default = FILE:/var/log/krb5libs.log
  kdc = FILE:/var/log/krb5kdc.log
  admin_server = FILE:/var/log/kadmind.log

[libdefaults]
  dns_canonicalize_hostname = false
  dns_lookup_realm = false
  ticket_lifetime = 24h
  renew_lifetime = 7d
  forwardable = true
  rdns = false
  default_realm = EXAMPLE.COM

[realms]
  EXAMPLE.COM = {
    kdc = SERVER
    admin_server = SERVER
  }

[domain_realm]
  .example.com = EXAMPLE.COM
  example.com = EXAMPLE.COM
File starter.sh:
#!/usr/bin/env bash
set -e

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

if [ -n "$SLEEP_SECONDS" ]; then
  echo "Sleeping for $SLEEP_SECONDS seconds"
  sleep "$SLEEP_SECONDS"
fi

if [ -n "$WAITFOR" ]; then
  echo "Waiting for the service $WAITFOR"
  WAITFOR_HOST=$(printf "%s\n" "$WAITFOR" | cut -d : -f 1)
  WAITFOR_PORT=$(printf "%s\n" "$WAITFOR" | cut -d : -f 2)
  for i in $(seq "${WAITFOR_TIMEOUT:-300}" -1 0); do
    set +e
    nc -z "$WAITFOR_HOST" "$WAITFOR_PORT" > /dev/null 2>&1
    result=$?
    set -e
    if [ $result -eq 0 ]; then
      break
    fi
    sleep 1
  done
  if [ "$i" -eq 0 ]; then
    echo "Timed out waiting for service $WAITFOR." >&2
    exit 1
  fi
fi

if [ -n "$KERBEROS_ENABLED" ]; then
  echo "Setting up kerberos!!"
  KERBEROS_SERVER=${KERBEROS_SERVER:-krb5}
  ISSUER_SERVER=${ISSUER_SERVER:-$KERBEROS_SERVER\:8081}
  echo "KDC ISSUER_SERVER => $ISSUER_SERVER"
  if [ -n "$SLEEP_SECONDS" ]; then
    echo "Sleeping for $SLEEP_SECONDS seconds"
    sleep "$SLEEP_SECONDS"
  fi
  if [ -z "$KEYTAB_DIR" ]; then
    KEYTAB_DIR='/etc/security/keytabs'
  fi
  while true; do
    set +e
    STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://"$ISSUER_SERVER"/keytab/test/test)
    set -e
    if [ "$STATUS" -eq 200 ]; then
      echo "Got 200, KDC service ready!!"
      break
    else
      echo "Got $STATUS :( KDC service not ready yet..."
    fi
    sleep 5
  done
  HOST_NAME=$(hostname -f)
  export HOST_NAME
  for NAME in ${KERBEROS_KEYTABS}; do
    echo "Download $NAME/$HOST_NAME@EXAMPLE.COM keytab file to $KEYTAB_DIR/$NAME.keytab"
    wget "http://$ISSUER_SERVER/keytab/$HOST_NAME/$NAME" -O "$KEYTAB_DIR/$NAME.keytab"
    klist -kt "$KEYTAB_DIR/$NAME.keytab"
    KERBEROS_ENABLED=true
  done
  sed "s/SERVER/$KERBEROS_SERVER/g" "$DIR"/krb5.conf | sudo tee /etc/krb5.conf
fi

# To avoid docker volume permission problems
sudo chmod o+rwx /data

"$DIR"/envtoconf.py --destination "${HADOOP_CONF_DIR:-/opt/hadoop/etc/hadoop}"

if [ -n "$ENSURE_NAMENODE_DIR" ]; then
  CLUSTERID_OPTS=""
  if [ -n "$ENSURE_NAMENODE_CLUSTERID" ]; then
    CLUSTERID_OPTS="-clusterid $ENSURE_NAMENODE_CLUSTERID"
  fi
  if [ ! -d "$ENSURE_NAMENODE_DIR" ]; then
    # word splitting of $CLUSTERID_OPTS is intentional, so it stays unquoted
    /opt/hadoop/bin/hdfs namenode -format -force $CLUSTERID_OPTS
  fi
fi

if [ -n "$ENSURE_STANDBY_NAMENODE_DIR" ]; then
  if [ ! -d "$ENSURE_STANDBY_NAMENODE_DIR" ]; then
    /opt/hadoop/bin/hdfs namenode -bootstrapStandby
  fi
fi

if [ -n "$ENSURE_SCM_INITIALIZED" ]; then
  if [ ! -f "$ENSURE_SCM_INITIALIZED" ]; then
    # Improve om and scm start up options
    /opt/hadoop/bin/ozone scm --init || /opt/hadoop/bin/ozone scm -init
  fi
fi

if [ -n "$ENSURE_OM_INITIALIZED" ]; then
  if [ ! -f "$ENSURE_OM_INITIALIZED" ]; then
    # Improve om and scm start up options
    /opt/hadoop/bin/ozone om --init || /opt/hadoop/bin/ozone om -createObjectStore
  fi
fi

# Supports instrumenting the hadoop process with a byteman script
if [ -n "$BYTEMAN_SCRIPT" ] || [ -n "$BYTEMAN_SCRIPT_URL" ]; then
  export PATH=$PATH:$BYTEMAN_DIR/bin
  if [ -n "$BYTEMAN_SCRIPT_URL" ]; then
    sudo wget "$BYTEMAN_SCRIPT_URL" -O /tmp/byteman.btm
    export BYTEMAN_SCRIPT=/tmp/byteman.btm
  fi
  if [ ! -f "$BYTEMAN_SCRIPT" ]; then
    echo "ERROR: The defined $BYTEMAN_SCRIPT does not exist!!!" >&2
    exit 1
  fi
  AGENT_STRING="-javaagent:/opt/byteman.jar=script:$BYTEMAN_SCRIPT"
  export HADOOP_OPTS="$AGENT_STRING $HADOOP_OPTS"
  echo "Process is instrumented with adding $AGENT_STRING to HADOOP_OPTS"
fi

"$@"
File transformation.py:
#!/usr/bin/python
"""This module transforms properties into different formats"""

def render_yaml(yaml_root, prefix=""):
  """render yaml"""
  result = ""
  if isinstance(yaml_root, dict):
    if prefix:
      result += "\n"
    for key in yaml_root:
      result += "{}{}: {}".format(prefix, key, render_yaml(
          yaml_root[key], prefix + " "))
  elif isinstance(yaml_root, list):
    result += "\n"
    for item in yaml_root:
      result += prefix + " - " + render_yaml(item, prefix + " ")
  else:
    result += "{}\n".format(yaml_root)
  return result

def to_yaml(content):
  """transform to yaml"""
  props = process_properties(content)
  keys = props.keys()
  yaml_props = {}
  for key in keys:
    parts = key.split(".")
    node = yaml_props
    prev_part = None
    parent_node = {}
    for part in parts[:-1]:
      if part.isdigit():
        if isinstance(node, dict):
          parent_node[prev_part] = []
          node = parent_node[prev_part]
        while len(node) <= int(part):
          node.append({})
        parent_node = node
        # index into the list with the current path segment
        node = node[int(part)]
      else:
        if part not in node:
          node[part] = {}
        parent_node = node
        node = node[part]
      prev_part = part
    if parts[-1].isdigit():
      if isinstance(node, dict):
        parent_node[prev_part] = []
        node = parent_node[prev_part]
      node.append(props[key])
    else:
      node[parts[-1]] = props[key]
  return render_yaml(yaml_props)

def to_yml(content):
  """transform to yml"""
  return to_yaml(content)

def to_properties(content):
  """transform to properties"""
  result = ""
  props = process_properties(content)
  for key, val in props.items():
    result += "{}: {}\n".format(key, val)
  return result

def to_env(content):
  """transform to environment variables"""
  result = ""
  props = process_properties(content)
  # iterate over items(), not the dict itself, to get key/value pairs
  for key, val in props.items():
    result += "{}={}\n".format(key, val)
  return result

def to_sh(content):
  """transform to shell"""
  result = ""
  props = process_properties(content)
  for key, val in props.items():
    result += "export {}=\"{}\"\n".format(key, val)
  return result

def to_cfg(content):
  """transform to config"""
  result = ""
  props = process_properties(content)
  for key, val in props.items():
    result += "{}={}\n".format(key, val)
  return result

def to_conf(content):
  """transform to configuration"""
  result = ""
  props = process_properties(content)
  for key, val in props.items():
    result += "export {}={}\n".format(key, val)
  return result

def to_xml(content):
  """transform to xml"""
  result = "<configuration>\n"
  props = process_properties(content)
  for key in props:
    result += "<property><name>{0}</name><value>{1}</value></property>\n". \
        format(key, props[key])
  result += "</configuration>"
  return result

def process_properties(content, sep=': ', comment_char='#'):
  """Parse the content passed as parameter as a properties file."""
  props = {}
  for line in content.split("\n"):
    sline = line.strip()
    if sline and not sline.startswith(comment_char):
      key_value = sline.split(sep)
      key = key_value[0].strip()
      value = sep.join(key_value[1:]).strip().strip('"')
      props[key] = value
  return props
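Each to_* function consumes the intermediate "key: value" lines that envtoconf.py collects in the .raw files. A quick way to see the XML rendering, run from the directory containing transformation.py (the property is illustrative):

python2 -c 'import transformation; print(transformation.to_xml("fs.defaultFS: hdfs://namenode:9000"))'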
Build the image (run in the /base directory):
docker build -t apache/hadoop-runner -f Dockerfile .
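Optionally, verify that the image runs as the unprivileged hadoop user created above:

docker run --rm --entrypoint id apache/hadoop-runner
# expected output along the lines of: uid=1000(hadoop) gid=100(users) ...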
Build the Hadoop image
Reference documentation
Prepare the following files in the hadoop directory.
File log4j.properties (optional): it routes the logs to stdout so that docker logs and kubectl logs can show the container logs directly.
log4j.rootLogger: INFO, stdout
log4j.appender.stdout: org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout: org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern: %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
File Dockerfile
Key setting: change HADOOP_VERSION to the Hadoop version you use (it can also be overridden at build time via --build-arg, as shown below).
FROM apache/hadoop-runner
WORKDIR /opt
ARG HADOOP_VERSION=3.1.1
COPY hadoop-${HADOOP_VERSION}.tar.gz /opt/hadoop.tar.gz
RUN sudo rm -rf /opt/hadoop \
    && tar zxf hadoop.tar.gz \
    && rm hadoop.tar.gz \
    && mv hadoop* hadoop \
    && rm -rf /opt/hadoop/share/doc
WORKDIR /opt/hadoop
ADD log4j.properties /opt/hadoop/etc/hadoop/log4j.properties
RUN sudo chown -R hadoop:users /opt/hadoop/etc/hadoop/*
ENV HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
Download the Hadoop release tarball: check the version list in the Apache archive and pick the version you need. For example, for v3.1.1: https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
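For example, fetch the tarball into the hadoop build directory so the COPY step in the Dockerfile can find it:

cd hadoop
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz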
Build the Hadoop image (run in the hadoop directory):
docker build -t hadoop:3.1.1 .
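To build another release without editing the Dockerfile, pass the version as a build argument and smoke-test the result (the 3.2.4 tag is purely illustrative):

docker build --build-arg HADOOP_VERSION=3.2.4 -t hadoop:3.2.4 .
docker run --rm hadoop:3.2.4 hdfs version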