472e165b56
Slurmdbd requires a database password, which is stored in slurmdbd.conf. A separate config file prevents the password from ending up in the Nix store. Slurmdbd 19.5 does not support MySQL socket connections. Adapted the slurm test to provide a username and password.
148 lines
4.1 KiB
Nix
148 lines
4.1 KiB
Nix
import ./make-test.nix ({ lib, ... }:
|
||
let
|
||
# Munge key that the test script writes to /etc/munge/munge.key on every
# machine of the test. As its value says, it is a weak key picked for ease
# of use in a test.
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
|
||
|
||
# Slurm settings shared by the control, submit and compute machines; each of
# them merges this set into its own services.slurm definition.
slurmconfig = {
  # Accounting goes through slurmdbd on the machine named "dbd".
  extraConfig = ''
    AccountingStorageHost=dbd
    AccountingStorageType=accounting_storage/slurmdbd
  '';

  # A single "debug" partition spanning node1..node3.
  partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
  nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];

  controlMachine = "control";
};
|
||
in {
|
||
name = "slurm";
|
||
|
||
meta.maintainers = [ lib.maintainers.markuskowa ];
|
||
|
||
nodes =
|
||
let
|
||
# Module used for the compute machines: enables services.slurm.client on top
# of the shared slurmconfig.
computeNode =
  { ... }:
  {
    services.slurm = { client.enable = true; } // slurmconfig;

    # TODO slurmd port and slurmctld port should be configurations and
    # automatically allowed by the firewall.
    networking.firewall.enable = false;
  };
|
||
in {
|
||
|
||
# Controller machine: enables services.slurm.server on top of the shared
# slurmconfig. The firewall is switched off for the test.
control =
  { ... }:
  {
    services.slurm = { server.enable = true; } // slurmconfig;
    networking.firewall.enable = false;
  };
|
||
|
||
# Submit machine: sets services.slurm.enableStools on top of the shared
# slurmconfig; the test script launches its srun job from this machine.
submit =
  { ... }:
  {
    services.slurm = { enableStools = true; } // slurmconfig;
    networking.firewall.enable = false;
  };
|
||
|
||
# Accounting machine: runs slurmdbd together with a local MariaDB server that
# holds the slurm_acct_db database.
dbd =
  { pkgs, ... }:
  let
    # Password of the 'slurm' database account. slurmdbd (storagePass) and the
    # SQL init script must use the same value, so it is defined exactly once.
    slurmDbPassword = "password123";

    # Creates the password-authenticated 'slurm'@'localhost' account and
    # grants it full access to the accounting database.
    mysqlInitScript = pkgs.writeText "mysql-init.sql" ''
      CREATE USER 'slurm'@'localhost' IDENTIFIED BY '${slurmDbPassword}';
      GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
    '';
  in
  {
    networking.firewall.enable = false;

    services.slurm.dbdserver = {
      enable = true;
      storagePass = slurmDbPassword;
    };

    services.mysql = {
      enable = true;
      package = pkgs.mariadb;
      initialScript = mysqlInitScript;
      ensureDatabases = [ "slurm_acct_db" ];
      ensureUsers = [{
        name = "slurm";
        ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
      }];
      extraOptions = ''
        # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
        innodb_buffer_pool_size=1024M
        innodb_log_file_size=64M
        innodb_lock_wait_timeout=900
      '';
    };
  };
|
||
|
||
node1 = computeNode;
|
||
node2 = computeNode;
|
||
node3 = computeNode;
|
||
};
|
||
|
||
|
||
testScript =
|
||
''
|
||
startAll;
|
||
|
||
# Set up authentication across the cluster: install the same munge key on
# every machine and restart munged afterwards.
for my $node ($submit, $control, $dbd, $node1, $node2, $node3) {
    $node->waitForUnit("default.target");

    # -p keeps this step from failing when /etc/munge already exists.
    $node->succeed("mkdir -p /etc/munge");
    $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
    # The key file is made read-only for its owner and handed to the munge user.
    $node->succeed("chmod 0400 /etc/munge/munge.key");
    $node->succeed("chown munge:munge /etc/munge/munge.key");
    $node->succeed("systemctl restart munged");

    $node->waitForUnit("munged");
}
|
||
|
||
# The Slurm services have probably failed on their first start because munge
# was not initialised yet, so they are restarted explicitly from here on.
subtest "can_start_slurmdbd" => sub {
    # Port that slurmdbd is expected to be listening on once it is up.
    my $slurmdbdPort = 6819;

    $dbd->succeed("systemctl restart slurmdbd");
    $dbd->waitForUnit("slurmdbd.service");
    $dbd->waitForOpenPort($slurmdbdPort);
};
|
||
|
||
# The database must already contain an entry for the current cluster before
# slurmctld is restarted.
subtest "add_account" => sub {
    $control->succeed("sacctmgr -i add cluster default");

    # Verify the entry: the first column of the cluster listing has to
    # contain "default".
    my $findCluster = "sacctmgr list cluster | awk '{ print \$1 }' | grep default";
    $control->succeed($findCluster);
};
|
||
|
||
# Restart slurmctld on the control machine and wait for its unit to come up.
subtest "can_start_slurmctld" => sub {
    my $unit = "slurmctld.service";

    $control->succeed("systemctl restart slurmctld");
    $control->waitForUnit($unit);
};
|
||
|
||
# Restart slurmd on each compute node in turn, waiting for the unit before
# moving on to the next node.
subtest "can_start_slurmd" => sub {
    for my $computeNode ($node1, $node2, $node3) {
        $computeNode->succeed("systemctl restart slurmd.service");
        $computeNode->waitForUnit("slurmd");
    }
};
|
||
|
||
# Check that the cluster works and is able to distribute jobs.
subtest "run_distributed_command" => sub {
    # `hostname` is run on 3 nodes of the partition, i.e. on all of them.
    # After sort | uniq the output has to consist of exactly 3 different names.
    my $countDistinctHosts = "srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq";
    $submit->succeed($countDistinctHosts);
};
|
||
|
||
subtest "check_slurm_dbd" => sub {
    # Find the srun job from above in the accounting database. The record is
    # not necessarily there right away, so poll for it instead of sleeping
    # for a fixed time and checking only once.
    $control->waitUntilSucceeds("sacct | grep hostname");
};
|
||
'';
|
||
})
|