Commit 7a9373bd authored by Dietmar Maurer

add HA resource agent

parent 45dbd87e
include ../defines.mk
SUBDIRS = init.d cron test
SUBDIRS = init.d cron ocf test
SCRIPTS = \
vzdump \
......
include ../../defines.mk

all:

SCRIPTS = pvevm

.PHONY: install
install: ${SCRIPTS}
	install -d ${HARADIR}
	install -m 0755 ${SCRIPTS} ${HARADIR}

.PHONY: distclean
distclean: clean

.PHONY: clean
clean:
	rm -rf *~
#!/usr/bin/perl -w
# Resource Agent for managing PVE VMs (openvz and qemu-kvm)
#
# License: GNU Affero General Public License (AGPL3)
# Copyright (C) 2011 Proxmox Server Solutions GmbH
use strict;
use PVE::Tools;
use PVE::ProcFSTools;
use PVE::Cluster;
use PVE::INotify;
use PVE::RPCEnvironment;
use PVE::OpenVZ;
use PVE::API2::OpenVZ;
use PVE::QemuServer;
use PVE::API2::Qemu;
use constant OCF_SUCCESS => 0;
use constant OCF_ERR_GENERIC => 1;
use constant OCF_ERR_ARGS => 2;
use constant OCF_ERR_UNIMPLEMENTED => 3;
use constant OCF_ERR_PERM => 4;
use constant OCF_ERR_INSTALLED => 5;
use constant OCF_ERR_CONFIGURED => 6;
use constant OCF_NOT_RUNNING => 7;
use constant OCF_RUNNING_MASTER => 8;
use constant OCF_FAILED_MASTER => 9;
$ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';
$SIG{__DIE__} = sub {
    die @_ if $^S; # skip if inside eval
    # set $! so that an uncaught die() exits with OCF_ERR_GENERIC
    $! = OCF_ERR_GENERIC;
};
if ($> != 0) {
    print STDERR "Cannot control VMs as non-root user.\n";
    exit(OCF_ERR_PERM);
}
PVE::INotify::inotify_init();
my $rpcenv = PVE::RPCEnvironment->init('ha');
$rpcenv->init_request();
$rpcenv->set_language($ENV{LANG});
$rpcenv->set_user('root@pam');
my $nodename = PVE::INotify::nodename();
my @ssh_opts = ('-o', 'BatchMode=yes');
my @ssh_cmd = ('ssh', @ssh_opts);
sub ocf_log {
    my ($level, $msg) = @_;

    # fixme:
    chomp $msg;
    print "$level: $msg\n";
}
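
# The bare "fixme" above suggests the plain print logging is provisional. A
# minimal sketch of routing the same messages to syslog with the core
# Sys::Syslog module could look like the following; the 'pvevm' identifier
# and the 'daemon' facility are assumptions, not part of this commit:
#
#   use Sys::Syslog qw(openlog syslog closelog);
#
#   sub ocf_log_syslog {
#       my ($level, $msg) = @_;
#       chomp $msg;
#       openlog('pvevm', 'pid', 'daemon');
#       syslog($level, '%s', $msg);  # 'debug', 'info' and 'err' are valid priorities
#       closelog();
#   }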
sub check_running {
    my ($status, $verbose) = @_;

    if ($status->{type} eq 'qemu') {
        $status->{running} = PVE::QemuServer::check_running($status->{vmid}, 1);
    } elsif ($status->{type} eq 'openvz') {
        $status->{running} = PVE::OpenVZ::check_running($status->{vmid});
    } else {
        die "got strange VM type '$status->{type}'\n";
    }
}
sub validate_all {
    my $status = {};

    eval {
        my $vmid = $ENV{OCF_RESKEY_vmid};
        die "no VMID specified\n" if !defined($vmid);
        die "got invalid VMID '$vmid'\n" if $vmid !~ m/^[1-9]\d*$/;

        my $vmlist = PVE::Cluster::get_vmlist();
        die "got empty cluster VM list\n" if !$vmlist || !$vmlist->{ids};

        my $data = $vmlist->{ids}->{$vmid};
        die "VM $vmid does not exist\n" if !$data;

        $status->{vmid} = $vmid;
        $status->{type} = $data->{type};
        $status->{node} = $data->{node};

        ocf_log('debug', "VM $vmid ($status->{type}) on node $status->{node}\n");

        check_running($status);
    };
    if (my $err = $@) {
        ocf_log('err', $err);
        exit(OCF_ERR_ARGS);
    }

    return $status;
}
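
# For reference, validate_all() returns a status hash shaped like this
# (the values shown are illustrative, not from this commit):
#
#   { vmid => '101', type => 'qemu', node => 'node1', running => 1 }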
sub upid_wait {
    my ($upid) = @_;

    my $task = PVE::Tools::upid_decode($upid);

    sleep(1);
    while (PVE::ProcFSTools::check_process_running($task->{pid}, $task->{pstart})) {
        ocf_log('debug', "Task still active, waiting");
        sleep(1);
    }
}
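
# The API calls used below (vm_start, vm_stop, migrate_vm) return a UPID task
# identifier for the worker they fork; upid_wait() decodes it and polls until
# that worker process has exited, e.g. (the VMID is illustrative):
#
#   my $upid = PVE::API2::Qemu->vm_start({node => $nodename, vmid => 101});
#   upid_wait($upid);  # block until the start task has finished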
my $cmd = shift || '';

my $migratetarget;
$migratetarget = shift if $cmd eq 'migrate';

die "too many arguments\n" if scalar(@ARGV) != 0;
if ($cmd eq 'start') {
    my $status = validate_all();
    if ($status->{running}) {
        ocf_log('info', "Resource is already running");
        exit(OCF_SUCCESS);
    }

    if ($status->{node} ne $nodename) {
        ocf_log('info', "Move config to local node");
        my ($oldconfig, $newconfig);
        if ($status->{type} eq 'qemu') {
            $oldconfig = PVE::QemuServer::config_file($status->{vmid}, $status->{node});
            $newconfig = PVE::QemuServer::config_file($status->{vmid}, $nodename);
        } else {
            $oldconfig = PVE::OpenVZ::config_file($status->{vmid}, $status->{node});
            $newconfig = PVE::OpenVZ::config_file($status->{vmid}, $nodename);
        }
        if (!rename($oldconfig, $newconfig)) {
            ocf_log('err', "unable to move config file from '$oldconfig' to '$newconfig' - $!");
            exit(OCF_ERR_GENERIC);
        }
    }

    my $upid;

    if ($status->{type} eq 'qemu') {
        $upid = PVE::API2::Qemu->vm_start({node => $nodename, vmid => $status->{vmid}});
    } else {
        $upid = PVE::API2::OpenVZ->vm_start({node => $nodename, vmid => $status->{vmid}});
    }

    upid_wait($upid);

    check_running($status);

    exit($status->{running} ? OCF_SUCCESS : OCF_ERR_GENERIC);
} elsif($cmd eq 'stop') {
    my $status = validate_all();

    if (!$status->{running}) {
        ocf_log('info', "Resource is already stopped");
        exit(OCF_SUCCESS);
    }

    my $upid;

    if ($status->{type} eq 'qemu') {
        $upid = PVE::API2::Qemu->vm_stop({node => $nodename, vmid => $status->{vmid}});
    } else {
        $upid = PVE::API2::OpenVZ->vm_stop({node => $nodename, vmid => $status->{vmid}, fast => 1});
    }

    upid_wait($upid);

    check_running($status);

    exit($status->{running} ? OCF_ERR_GENERIC : OCF_SUCCESS);

} elsif($cmd eq 'recover' || $cmd eq 'restart' || $cmd eq 'reload') {

    exit(OCF_SUCCESS);
} elsif($cmd eq 'status' || $cmd eq 'monitor') {
    my $status = validate_all();

    if ($status->{running}) {
        ocf_log('debug', "Resource is running");
        exit(OCF_SUCCESS);
    } else {
        ocf_log('debug', "Resource is not running");
        exit(OCF_NOT_RUNNING);
    }
} elsif($cmd eq 'migrate') {
    my $status = validate_all();
    if (!$status->{running}) {
        ocf_log('err', "Resource is not running");
        exit(OCF_ERR_GENERIC);
    }

    if (!$migratetarget) {
        ocf_log('err', "No target specified");
        exit(OCF_ERR_ARGS);
    }

    # test ssh connection and try to detect node name
    my @rem_ssh = (@ssh_cmd, "root\@$migratetarget");
    my $cmd = [ @rem_ssh, '/bin/hostname' ];
    my $targetnode = '';
    eval {
        PVE::Tools::run_command($cmd, outfunc => sub {
            $targetnode = shift if !$targetnode;
        });
    };
    if (my $err = $@) {
        ocf_log('err', "can't connect to target '$migratetarget' - $err");
        exit(OCF_ERR_GENERIC);
    }

    if (!PVE::Cluster::check_node_exists($targetnode, 1)) {
        ocf_log('err', "target hostname '$targetnode' is no cluster member");
        exit(OCF_ERR_GENERIC);
    }

    my $upid;
    my $params = {
        node => $nodename,
        vmid => $status->{vmid},
        target => $targetnode,
        online => 1,
    };

    my $oldconfig;
    if ($status->{type} eq 'qemu') {
        $oldconfig = PVE::QemuServer::config_file($status->{vmid}, $status->{node});
        $upid = PVE::API2::Qemu->migrate_vm($params);
    } else {
        $oldconfig = PVE::OpenVZ::config_file($status->{vmid}, $status->{node});
        $upid = PVE::API2::OpenVZ->migrate_vm($params);
    }

    upid_wait($upid);

    # something went wrong if old config file is still there
    exit((-f $oldconfig) ? OCF_ERR_GENERIC : OCF_SUCCESS);
} elsif($cmd eq 'reconfig') {
    # Reconfigure a running VM
    my $status = validate_all();
    # we do nothing here

} elsif($cmd eq 'meta-data') {
    while(<DATA>) {
        print;
    }
} elsif($cmd eq 'validate-all') {
    my $status = validate_all();

} else {
    die "usage: $0 {start|stop|recover|restart|reload|status|monitor|migrate|reconfig|meta-data|validate-all}\n";
}

exit(OCF_SUCCESS);
__DATA__
<?xml version="1.0"?>
<resource-agent version="rgmanager 2.0" name="pvevm">
<version>1.0</version>
<longdesc lang="en">
Defines a PVE Virtual Machine
</longdesc>
<shortdesc lang="en">
Defines a PVE Virtual Machine
</shortdesc>
<parameters>
<parameter name="vmid" primary="1">
<longdesc lang="en">
This is the VMID of the virtual machine.
</longdesc>
<shortdesc lang="en">
VMID
</shortdesc>
<content type="string"/>
</parameter>
<parameter name="domain" reconfig="1">
<longdesc lang="en">
Failover domains define lists of cluster members
to try in the event that the host of the virtual machine
fails.
</longdesc>
<shortdesc lang="en">
Cluster failover domain
</shortdesc>
<content type="string"/>
</parameter>
<parameter name="autostart" reconfig="1">
<longdesc lang="en">
If set to yes, this resource group will automatically be started
after the cluster forms a quorum. If set to no, this virtual
machine will start in the 'disabled' state after the cluster
forms a quorum.
</longdesc>
<shortdesc lang="en">
Automatic start after quorum formation
</shortdesc>
<content type="boolean" default="1"/>
</parameter>
<parameter name="exclusive" reconfig="1">
<longdesc lang="en">
If set, this resource group will only relocate to
nodes which have no other resource groups running in the
event of a failure. If no empty nodes are available,
this resource group will not be restarted after a failure.
Additionally, resource groups will not automatically
relocate to the node running this resource group. This
option can be overridden by manual start and/or relocate
operations.
</longdesc>
<shortdesc lang="en">
Exclusive resource group
</shortdesc>
<content type="boolean" default="0"/>
</parameter>
<parameter name="recovery" reconfig="1">
<longdesc lang="en">
This currently has three possible options: "restart" tries
to restart this virtual machine locally before
attempting to relocate (default); "relocate" does not bother
trying to restart the VM locally; "disable" disables
the VM if it fails.
</longdesc>
<shortdesc lang="en">
Failure recovery policy
</shortdesc>
<content type="string"/>
</parameter>
<parameter name="depend">
<longdesc lang="en">
Service dependency; will not start without the specified
service running.
</longdesc>
<shortdesc lang="en">
Top-level service this depends on, in service:name format.
</shortdesc>
<content type="string"/>
</parameter>
<parameter name="depend_mode">
<longdesc lang="en">
Service dependency mode.
hard - This service is stopped/started if its dependency
       is stopped/started.
soft - This service only depends on the other service for
       initial startup. If the other service stops, this
       service is not stopped.
</longdesc>
<shortdesc lang="en">
Service dependency mode (soft or hard).
</shortdesc>
<content type="string" default="hard"/>
</parameter>
<parameter name="max_restarts" reconfig="1">
<longdesc lang="en">
Maximum restarts for this service.
</longdesc>
<shortdesc lang="en">
Maximum restarts for this service.
</shortdesc>
<content type="string" default="0"/>
</parameter>
<parameter name="restart_expire_time" reconfig="1">
<longdesc lang="en">
Restart expiration time. A restart is forgotten
after this time. When combined with the max_restarts
option, this lets administrators specify a threshold
for when to fail over services. If max_restarts
is exceeded in this given expiration time, the service
is relocated instead of restarted again.
</longdesc>
<shortdesc lang="en">
Restart expiration time; amount of time before a restart
is forgotten.
</shortdesc>
<content type="string" default="0"/>
</parameter>
<parameter name="status_program" reconfig="1">
<longdesc lang="en">
Ordinarily, only the presence/health of a virtual machine
is checked. If specified, the status_program value is
executed during a depth 10 check. The intent of this
program is to ascertain the status of critical services
within a virtual machine.
</longdesc>
<shortdesc lang="en">
Additional status check program
</shortdesc>
<content type="string" default=""/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="75"/>
<action name="stop" timeout="75"/>
<action name="status" timeout="10" interval="30"/>
<action name="monitor" timeout="10" interval="30"/>
<!-- depth 10 calls the status_program -->
<action name="status" depth="10" timeout="20" interval="60"/>
<action name="monitor" depth="10" timeout="20" interval="60"/>
<!-- reconfigure - reconfigure with new OCF parameters.
NOT OCF COMPATIBLE AT ALL -->
<action name="reconfig" timeout="10"/>
<action name="migrate" timeout="10m"/>
<action name="meta-data" timeout="5"/>
<action name="validate-all" timeout="5"/>
</actions>
<special tag="rgmanager">
<!-- Destroy_on_delete / init_on_add are currently only
supported for migratory resources (no children
and the 'migrate' action; see above). Do not try this
with normal services -->
<attributes maxinstances="1" destroy_on_delete="0" init_on_add="0"/>
</special>
</resource-agent>
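The agent is installed into /usr/share/cluster (HARADIR in defines.mk below), where rgmanager looks up resource agents by name; in cluster.conf the resource is typically referenced as a pvevm element carrying the primary vmid attribute documented above. Because rgmanager hands parameters to the agent through OCF_RESKEY_* environment variables, the agent can also be smoke-tested by hand. A minimal sketch, assuming VMID 101 exists, the agent is already installed, and the command runs as root on a cluster node (none of which is asserted by this commit):

#!/usr/bin/perl -w
use strict;

# Set the parameter rgmanager would normally pass, invoke the installed
# agent, then decode the OCF exit status from the child return value.
$ENV{OCF_RESKEY_vmid} = '101';   # assumed test VMID
my $rc = system('/usr/share/cluster/pvevm', 'monitor');
printf "agent returned OCF code %d\n", $rc >> 8;   # 0 = running, 7 = not running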
pve-manager (2.0-15) unstable; urgency=low

  * add HA resource agent

 -- Proxmox Support Team <support@proxmox.com>  Tue, 13 Dec 2011 10:18:20 +0100

pve-manager (2.0-14) unstable; urgency=low

  * add Japanese translation (many thanks to Koichi!)
......
......
@@ -2,13 +2,14 @@ RELEASE=2.0
VERSION=2.0
PACKAGE=pve-manager
PACKAGERELEASE=14
PACKAGERELEASE=15
BINDIR=${DESTDIR}/usr/bin
PERLLIBDIR=${DESTDIR}/usr/share/perl5
MAN1DIR=${DESTDIR}/usr/share/man/man1
CRONDAILYDIR=${DESTDIR}/etc/cron.daily
INITDBINDIR=${DESTDIR}/etc/init.d/
INITDBINDIR=${DESTDIR}/etc/init.d
HARADIR=${DESTDIR}/usr/share/cluster
DOCDIR=${DESTDIR}/usr/share/doc/${PACKAGE}
PODDIR=${DESTDIR}/usr/share/doc/${PACKAGE}/pod
WWWBASEDIR=${DESTDIR}/usr/share/${PACKAGE}
......