az-hop end-to-end (e2e) deployment

Azure HPC On-Demand Platform (az-hop) is a tool that provides an end-to-end deployment mechanism for a base HPC infrastructure on Azure. It uses industry-standard tools such as Terraform, Ansible, and Packer to provision and configure a complete HPC cluster solution that is ready for users to run applications. It also includes features such as an HPC OnDemand Portal, an Active Directory, a job scheduler, dynamic resource provisioning and autoscaling, a jumpbox, and various storage options (ref: https://github.com/Azure/az-hop).

Clone the repo

git clone --recursive https://github.com/Azure/az-hop.git

Create the config.yml file

---
project_name: az-hop
location: eastus
resource_group: JZ-azhop_v2
use_existing_rg: false

tags:
  env: dev
  project: azhop

log_analytics:
  create: false

monitoring:
  install_agent: false

alerting:
  enabled: false
  admin_email: email@email.com
  local_volume_threshold: 80

anf:
  create: false
  homefs_size_tb: 4
  homefs_service_level: Standard
  dual_protocol: false # true to enable SMB support. false by default
  alert_threshold: 80 # alert when ANF volume reaches this threshold

azurefiles:
  create: true
  size_gb: 1024

mounts:
  home: # This home name can't be changed
    type: azurefiles # anf or azurefiles, default to anf. One of the two should be defined in order to mount the home directory
    mountpoint: /anfhome # /sharedhome for example
    server: '' # Specify an existing NFS server name or IP, when using the ANF built in use ''
    export: '' # Specify an existing NFS export directory, when using the ANF built in use ''
    options: 'vers=4,minorversion=1,sec=sys' #'' # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev

admin_user: hpcadmin

network:
  create_nsg: true
  vnet:
    name: hpcvnet # Optional - default to hpcvnet
    address_space: "10.101.0.0/23"
    subnets: # all subnets are optional
      frontend:
        name: frontend
        address_prefixes: "10.101.0.0/29"
        create: true # create the subnet if true. default to true when not specified, default to false if using an existing VNET when not specified
      admin:
        name: admin
        address_prefixes: "10.101.0.16/28"
        create: true
      ad:
        name: ad
        address_prefixes: "10.101.0.8/29"
        create: true
      netapp:
        name: netapp
        address_prefixes: "10.101.0.32/28"
        create: true
      compute:
        name: compute
        address_prefixes: "10.101.1.0/24"
        create: true

locked_down_network:
  enforce: false
  public_ip: true # Enable public IP creation for Jumpbox, OnDemand and create images. Default to true

linux_base_image: "OpenLogic:CentOS:7_9-gen2:latest"
windows_base_image: "MicrosoftWindowsServer:WindowsServer:2019-Datacenter-smalldisk:latest" # publisher:offer:sku:version or image_id

deployer:
  vm_size: Standard_B2ms
ad:
  vm_size: Standard_B2ms
ondemand:
  vm_size: Standard_D4s_v5
  generate_certificate: true # Generate an SSL certificate for the OnDemand portal. Default to true
grafana:
  vm_size: Standard_B2ms
guacamole:
  vm_size: Standard_B2ms
scheduler:
  vm_size: Standard_B2ms
cyclecloud:
  vm_size: Standard_B2ms

users:
  - { name: hpcuser,   uid: 10001 }
  - { name: adminuser, uid: 10002, groups: [5001, 5002] }
  - { name: john.john,   uid: 10003 }

usergroups:
  - name: Domain Users # All users will be added to this one by default
    gid: 5000
  - name: az-hop-admins
    gid: 5001
    description: "For users with azhop admin privileges"
  - name: az-hop-localadmins
    gid: 5002
    description: "For users with sudo right or local admin right on nodes"

cvmfs_eessi:
  enabled: false

queue_manager: slurm

slurm:
  accounting_enabled: false
  slurm_version: 20.11.9

enroot:
  enroot_version: 3.4.1

database:
  user: sqladmin

bastion:
  create: false

vpn_gateway:
  create: false

authentication:
  httpd_auth: basic # oidc or basic

autoscale:
  idle_timeout: 1800 # Idle time in seconds before shutting down VMs - default to 1800 like in CycleCloud

queues:
  - name: execute
    vm_size: Standard_F2s_v2
    max_core_count: 20
    image: azhpc:azhop-compute:ubuntu-2004:latest
    spot: false
    ColocateNodes: false

enable_remote_winviz: false # Set to true to enable windows remote visualization

remoteviz:
  - name: winviz # This name is fixed and can't be changed
    vm_size: Standard_NV12s_v3 # Standard_NV8as_v4 Only NVsv3 and NVsV4 are supported
    max_core_count: 48
    image: "MicrosoftWindowsDesktop:Windows-10:21h1-pron:latest"
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: false

applications:
  bc_codeserver:
    enabled: true
  bc_jupyter:
    enabled: true
  bc_amlsdk:
    enabled: false
  bc_rstudio:
    enabled: true
  bc_ansys_workbench:
    enabled: false
  bc_vmd:
    enabled: false
  bc_paraview:
    enabled: false
  bc_vizer:
    enabled: false

Install dependencies

sudo ./toolset/scripts/install.sh

Build the backbone infrastructure using Bicep:

$ ./build.sh -a apply -l bicep

Find the deployer VM's public IP address in the Azure portal, then connect to the deployer VM over SSH:

$ ssh -i hpcadmin_id_rsa hpcadmin@20.231.50.26
The authenticity of host '20.231.50.26 (20.231.50.26)' can't be established.
ECDSA key fingerprint is SHA256:lPf4I4nZmZ7hzuxif9RZOVdMmGC6zMvSylTE79Tapwk.
Are you sure you want to continue connecting (yes/no/[fingerprint])? yes
Warning: Permanently added '20.231.50.26' (ECDSA) to the list of known hosts.
Welcome to Ubuntu 20.04.6 LTS (GNU/Linux 5.15.0-1036-azure x86_64)

Monitor the Ansible installation by following the cloud-init output log on the deployer VM:

hpcadmin@deployer:~$ sudo -i
root@deployer:~# cd /var/log/
root@deployer:/var/log# ls
apt       azure  chrony                 cloud-init.log  dmesg     journal   landscape  private  ubuntu-advantage.log  waagent.log
auth.log  btmp   cloud-init-output.log  dist-upgrade    dpkg.log  kern.log  lastlog    syslog   unattended-upgrades   wtmp
root@deployer:/var/log# tail -f cloud-init-output.log

The end of a successful deployment:

PLAY RECAP *********************************************************************
ccportal                   : ok=3    changed=2    unreachable=0    failed=0    skipped=1    rescued=0    ignored=0
grafana                    : ok=3    changed=2    unreachable=0    failed=0    skipped=1    rescued=0    ignored=0
ondemand                   : ok=3    changed=2    unreachable=0    failed=0    skipped=1    rescued=0    ignored=0
scheduler                  : ok=3    changed=2    unreachable=0    failed=0    skipped=1    rescued=0    ignored=0

Saturday 29 April 2023  03:58:01 +0000 (0:00:01.488)       0:00:03.239 ********
===============================================================================
chrony ------------------------------------------------------------------ 3.06s
include_role ------------------------------------------------------------ 0.11s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
total ------------------------------------------------------------------- 3.17s
Command succeeded!
Cloud-init v. 23.1.1-0ubuntu0~20.04.1 running 'modules:final' at Sat, 29 Apr 2023 03:16:47 +0000. Up 27.90 seconds.
Cloud-init v. 23.1.1-0ubuntu0~20.04.1 finished at Sat, 29 Apr 2023 03:58:01 +0000. Datasource DataSourceAzure [seed=/dev/sr0].  Up 2501.89 seconds

Get the OnDemand portal FQDN and the password for a user (the user names are those defined under `users` in config.yml):

root@deployer:/az-hop# cd /az-hop/
root@deployer:/az-hop# pwd
/az-hop
root@deployer:/az-hop# grep ondemand_fqdn playbooks/group_vars/all.yml
ondemand_fqdn: ondemandmxsmmtrkr6ehsx.eastus.cloudapp.azure.com
root@deployer:/az-hop# ./bin/get_secret john.john
j5hTzIyBXqVExNpB35pJGVra5sM=

Updated: