qemu/vma-reader.c
2013-03-26 06:21:16 +01:00

864 lines
23 KiB
C

/*
* VMA: Virtual Machine Archive
*
* Copyright (C) 2012 Proxmox Server Solutions
*
* Authors:
* Dietmar Maurer (dietmar@proxmox.com)
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <glib.h>
#include <uuid/uuid.h>

#include "qemu-common.h"
#include "qemu/timer.h"
#include "qemu/ratelimit.h"
#include "vma.h"
#include "block/block.h"
/* Number of bits in one unsigned long, the word type of the cluster bitmaps. */
#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
/*
 * One VMA block worth of zero bytes (static storage, so zero-initialized).
 * Used as the data source when zero blocks are written out explicitly.
 */
static unsigned char zero_vma_block[VMA_BLOCK_SIZE];
/*
 * Per-stream restore state; VmaReader keeps one entry per device id.
 */
typedef struct VmaRestoreState {
/* block device the stream is written to; NULL if none was registered */
BlockDriverState *bs;
/* when true, blocks the archive marks as zero are written explicitly */
bool write_zeroes;
/* one bit per cluster, set when that cluster has been seen in an extent */
unsigned long *bitmap;
/* number of unsigned long words allocated for bitmap */
int bitmap_size;
} VmaRestoreState;
/*
 * State of one VMA archive that is being read.
 */
struct VmaReader {
/* file descriptor the archive is read from */
int fd;
/* MD5 context, reset and reused for the header and each extent header */
GChecksum *md5csum;
/* header blobs (VmaBlob), keyed by their start offset in the blob buffer */
GHashTable *blob_hash;
/* raw header bytes; blob payloads and names point into this buffer */
unsigned char *head_data;
/* size and name of each stream found in the header, indexed by device id */
VmaDeviceInfo devinfo[256];
/* restore target and cluster bitmap of each stream, indexed by device id */
VmaRestoreState rstate[256];
/* list of VmaConfigData entries built while parsing the header */
GList *cdata_list;
/* device id of the stream named "vmstate"; stays 0 if there is none */
guint8 vmstate_stream;
/* vmstate clusters consumed so far; the next one must carry this number */
uint32_t vmstate_clusters;
/* to show restore percentage if run with -v */
time_t start_time;
/* expected cluster total, accumulated whenever stream state is allocated */
int64_t cluster_count;
/* clusters processed so far */
int64_t clusters_read;
/* bytes covered by blocks the archive marks as zero */
int64_t zero_cluster_data;
/* subset of zero_cluster_data that lies in clusters with a non-zero mask */
int64_t partial_zero_cluster_data;
/* percentage value printed most recently */
int clusters_read_per;
};
/* Hash callback for keys that point to a uint32_t: the value is the hash. */
static guint
g_int32_hash(gconstpointer v)
{
    const uint32_t *key = v;

    return *key;
}
/* Equality callback for keys that point to a uint32_t. */
static gboolean
g_int32_equal(gconstpointer v1, gconstpointer v2)
{
    const uint32_t *lhs = v1;
    const uint32_t *rhs = v2;

    return *lhs == *rhs;
}
/*
 * Return 1 if the bit for @cluster_num is set in @rstate's bitmap, else 0.
 * Asserts that the bitmap exists and that the cluster lies inside it.
 */
static int vma_reader_get_bitmap(VmaRestoreState *rstate, int64_t cluster_num)
{
    assert(rstate);
    assert(rstate->bitmap);

    unsigned long word = cluster_num / BITS_PER_LONG;
    unsigned long shift = cluster_num % BITS_PER_LONG;

    assert(rstate->bitmap_size > word);

    return ((rstate->bitmap[word] >> shift) & 1UL) ? 1 : 0;
}
/*
 * Set (@dirty != 0) or clear (@dirty == 0) the bit for @cluster_num in
 * @rstate's bitmap.  Asserts that the bitmap exists and that the cluster
 * lies inside it.
 */
static void vma_reader_set_bitmap(VmaRestoreState *rstate, int64_t cluster_num,
                                  int dirty)
{
    assert(rstate);
    assert(rstate->bitmap);

    unsigned long word = cluster_num / BITS_PER_LONG;

    assert(rstate->bitmap_size > word);

    unsigned long shift = cluster_num % BITS_PER_LONG;
    unsigned long flag = 1UL << shift;

    if (dirty) {
        rstate->bitmap[word] |= flag;
    } else {
        rstate->bitmap[word] &= ~flag;
    }
}
/*
 * One length-prefixed entry of the header's blob buffer.
 */
typedef struct VmaBlob {
/* offset of the entry's length prefix relative to the blob buffer; hash key */
uint32_t start;
/* payload length in bytes, taken from the 2-byte prefix */
uint32_t len;
/* payload; points into the reader's header buffer, not separately allocated */
void *data;
} VmaBlob;
/*
 * Look up the header blob that starts at blob-buffer offset @pos.
 * Returns NULL when no blob is registered under that offset.
 */
static const VmaBlob *get_header_blob(VmaReader *vmar, uint32_t pos)
{
    assert(vmar);
    assert(vmar->blob_hash);

    const VmaBlob *found = g_hash_table_lookup(vmar->blob_hash, &pos);

    return found;
}
/*
 * Return the header blob at blob-buffer offset @pos as a C string.
 *
 * Returns NULL when there is no such blob, when the blob is empty, or when
 * its last byte is not a NUL terminator.  The returned pointer refers to
 * the blob's payload and is not a copy.
 */
static const char *get_header_str(VmaReader *vmar, uint32_t pos)
{
    const VmaBlob *blob = get_header_blob(vmar, pos);

    /* an empty blob has no terminator to inspect; len - 1 would wrap */
    if (!blob || blob->len == 0) {
        return NULL;
    }

    const char *res = (const char *)blob->data;
    if (res[blob->len - 1] != '\0') {
        return NULL;
    }

    return res;
}
/*
 * read(2) wrapper that retries while the call fails with EINTR.
 * Returns whatever the final read() returned.
 */
static ssize_t
safe_read(int fd, unsigned char *buf, size_t count)
{
    for (;;) {
        ssize_t got = read(fd, buf, count);

        if (got >= 0 || errno != EINTR) {
            return got;
        }
    }
}
/*
 * Read up to @len bytes from @fd into @buf, continuing after short reads.
 *
 * Returns the number of bytes read; this is less than @len only when end
 * of file was reached first.  Returns -1 when a read fails.
 */
static ssize_t
full_read(int fd, unsigned char *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = safe_read(fd, buf + done, len - done);

        if (n < 0) {
            return -1;
        }
        if (n == 0) {
            break; /* EOF: report the short count */
        }
        done += n;
    }

    return done;
}
/*
 * Release everything owned by @vmar, including @vmar itself: the file
 * descriptor, the config list and its elements, the cluster bitmaps, the
 * checksum context, the blob table and the header buffer.
 */
void vma_reader_destroy(VmaReader *vmar)
{
    int i;

    assert(vmar);

    if (vmar->fd >= 0) {
        close(vmar->fd);
    }

    /*
     * Free the list nodes and the VmaConfigData structs they carry.  The
     * name/data members point into head_data and are released with it.
     */
    g_list_free_full(vmar->cdata_list, g_free);

    for (i = 1; i < 256; i++) {
        g_free(vmar->rstate[i].bitmap);
    }

    if (vmar->md5csum) {
        g_checksum_free(vmar->md5csum);
    }

    if (vmar->blob_hash) {
        g_hash_table_destroy(vmar->blob_hash);
    }

    g_free(vmar->head_data);

    g_free(vmar);
}
/*
 * Read and validate the archive header from vmar->fd.
 *
 * On success vmar->head_data holds the complete header (with header_size,
 * version, ctime and the blob buffer fields converted to host byte order),
 * vmar->blob_hash indexes the blobs, vmar->devinfo describes the streams
 * and vmar->cdata_list holds the config entries.
 *
 * Returns 0 on success, -1 with @errp set on failure.  Memory allocated so
 * far stays attached to @vmar and is released by vma_reader_destroy().
 */
static int vma_reader_read_head(VmaReader *vmar, Error **errp)
{
    assert(vmar);
    assert(errp);
    assert(*errp == NULL);

    unsigned char md5sum[16];
    int i;

    vmar->head_data = g_malloc(sizeof(VmaHeader));

    /*
     * full_read() does not touch errno when it hits EOF, so clear it first;
     * otherwise a stale value would be reported instead of "got EOF".
     */
    errno = 0;
    if (full_read(vmar->fd, vmar->head_data, sizeof(VmaHeader)) !=
        sizeof(VmaHeader)) {
        error_setg(errp, "can't read vma header - %s",
                   errno ? g_strerror(errno) : "got EOF");
        return -1;
    }

    VmaHeader *h = (VmaHeader *)vmar->head_data;

    if (h->magic != VMA_MAGIC) {
        error_setg(errp, "not a vma file - wrong magic number");
        return -1;
    }

    uint32_t header_size = GUINT32_FROM_BE(h->header_size);
    int need = header_size - sizeof(VmaHeader);
    if (need <= 0) {
        error_setg(errp, "wrong vma header size %d", header_size);
        return -1;
    }

    vmar->head_data = g_realloc(vmar->head_data, header_size);
    h = (VmaHeader *)vmar->head_data;

    errno = 0;
    if (full_read(vmar->fd, vmar->head_data + sizeof(VmaHeader), need) !=
        need) {
        error_setg(errp, "can't read vma header data - %s",
                   errno ? g_strerror(errno) : "got EOF");
        return -1;
    }

    /* the checksum is computed with the md5sum field zeroed */
    memcpy(md5sum, h->md5sum, 16);
    memset(h->md5sum, 0, 16);

    g_checksum_reset(vmar->md5csum);
    g_checksum_update(vmar->md5csum, vmar->head_data, header_size);
    gsize csize = 16;
    g_checksum_get_digest(vmar->md5csum, (guint8 *)(h->md5sum), &csize);

    if (memcmp(md5sum, h->md5sum, 16) != 0) {
        error_setg(errp, "wrong vma header chechsum");
        return -1;
    }

    /* we can modify header data after checksum verify */
    h->header_size = header_size;

    h->version = GUINT32_FROM_BE(h->version);
    if (h->version != 1) {
        error_setg(errp, "wrong vma version %d", h->version);
        return -1;
    }

    h->ctime = GUINT64_FROM_BE(h->ctime);
    h->blob_buffer_offset = GUINT32_FROM_BE(h->blob_buffer_offset);
    h->blob_buffer_size = GUINT32_FROM_BE(h->blob_buffer_size);

    /*
     * Validate the blob buffer without adding the two untrusted 32-bit
     * values first: their sum can wrap around and slip past a plain
     * "offset + size <= header_size" test.
     */
    if (h->blob_buffer_offset < sizeof(VmaHeader)) {
        error_setg(errp, "wrong vma blob buffer offset %d",
                   h->blob_buffer_offset);
        return -1;
    }

    if (h->blob_buffer_offset > header_size ||
        h->blob_buffer_size > header_size - h->blob_buffer_offset) {
        error_setg(errp, "wrong vma blob buffer size %d/%d",
                   h->blob_buffer_offset, h->blob_buffer_size);
        return -1;
    }

    /* the first byte of the blob buffer is skipped, so no blob starts at 0 */
    uint32_t bstart = h->blob_buffer_offset + 1;
    uint32_t bend = h->blob_buffer_offset + h->blob_buffer_size;

    /* each blob is a 2-byte little-endian length followed by the payload */
    while ((bstart + 2) <= bend) {
        uint32_t size = vmar->head_data[bstart] +
            (vmar->head_data[bstart+1] << 8);
        if ((bstart + size + 2) <= bend) {
            VmaBlob *blob = g_new0(VmaBlob, 1);
            blob->start = bstart - h->blob_buffer_offset;
            blob->len = size;
            blob->data = vmar->head_data + bstart + 2;
            g_hash_table_insert(vmar->blob_hash, &blob->start, blob);
        }
        bstart += size + 2;
    }

    int count = 0;
    for (i = 1; i < 256; i++) {
        VmaDeviceInfoHeader *dih = &h->dev_info[i];
        uint32_t devname_ptr = GUINT32_FROM_BE(dih->devname_ptr);
        uint64_t size = GUINT64_FROM_BE(dih->size);
        const char *devname = get_header_str(vmar, devname_ptr);

        if (size && devname) {
            count++;
            vmar->devinfo[i].size = size;
            vmar->devinfo[i].devname = devname;

            if (strcmp(devname, "vmstate") == 0) {
                vmar->vmstate_stream = i;
            }
        }
    }

    if (!count) {
        error_setg(errp, "vma does not contain data");
        return -1;
    }

    for (i = 0; i < VMA_MAX_CONFIGS; i++) {
        uint32_t name_ptr = GUINT32_FROM_BE(h->config_names[i]);
        uint32_t data_ptr = GUINT32_FROM_BE(h->config_data[i]);

        /* unused slots have a zero name or data pointer */
        if (!(name_ptr && data_ptr)) {
            continue;
        }
        const char *name = get_header_str(vmar, name_ptr);
        const VmaBlob *blob = get_header_blob(vmar, data_ptr);

        if (!(name && blob)) {
            error_setg(errp, "vma contains invalid data pointers");
            return -1;
        }

        VmaConfigData *cdata = g_new0(VmaConfigData, 1);
        cdata->name = name;
        cdata->data = blob->data;
        cdata->len = blob->len;

        vmar->cdata_list = g_list_append(vmar->cdata_list, cdata);
    }

    return 0;
}
/*
 * Open the archive @filename ("-" means a duplicate of stdin) and parse
 * its header.
 *
 * Returns a new reader, or NULL with @errp set when the file cannot be
 * opened, the checksum context cannot be created or the header is invalid.
 */
VmaReader *vma_reader_create(const char *filename, Error **errp)
{
    assert(filename);
    assert(errp);

    VmaReader *reader = g_new0(VmaReader, 1);
    bool from_stdin = (strcmp(filename, "-") == 0);

    reader->fd = from_stdin ? dup(0) : open(filename, O_RDONLY);
    if (reader->fd < 0) {
        error_setg(errp, "can't open file %s - %s\n", filename,
                   g_strerror(errno));
        goto fail;
    }

    reader->md5csum = g_checksum_new(G_CHECKSUM_MD5);
    if (!reader->md5csum) {
        error_setg(errp, "can't allocate cmsum\n");
        goto fail;
    }

    /* values (VmaBlob) are freed by the table; keys live inside the values */
    reader->blob_hash = g_hash_table_new_full(g_int32_hash, g_int32_equal,
                                              NULL, g_free);

    if (vma_reader_read_head(reader, errp) < 0) {
        goto fail;
    }

    return reader;

fail:
    vma_reader_destroy(reader);
    return NULL;
}
/* Return the parsed archive header; it stays owned by @vmar. */
VmaHeader *vma_reader_get_header(VmaReader *vmar)
{
    assert(vmar);
    assert(vmar->head_data);

    VmaHeader *header = (VmaHeader *)(vmar->head_data);

    return header;
}
/*
 * Return the list of config entries found in the header.  The list is the
 * reader's own, not a copy.
 */
GList *vma_reader_get_config_data(VmaReader *vmar)
{
    assert(vmar);
    assert(vmar->head_data);

    GList *configs = vmar->cdata_list;

    return configs;
}
/*
 * Return the device info for stream @dev_id (must be non-zero), or NULL
 * when the header has no stream with a size and a name under that id.
 */
VmaDeviceInfo *vma_reader_get_device_info(VmaReader *vmar, guint8 dev_id)
{
    assert(vmar);
    assert(dev_id);

    VmaDeviceInfo *info = &vmar->devinfo[dev_id];

    if (!info->size || !info->devname) {
        return NULL;
    }

    return info;
}
/*
 * Initialise the restore state of stream @dev_id (must be non-zero):
 * remember the target @bs and the @write_zeroes policy, allocate a cleared
 * cluster bitmap sized from the stream's size, and add the stream's
 * clusters to the expected total used for progress output.
 */
static void allocate_rstate(VmaReader *vmar, guint8 dev_id,
                            BlockDriverState *bs, bool write_zeroes)
{
    assert(vmar);
    assert(dev_id);

    vmar->rstate[dev_id].bs = bs;
    vmar->rstate[dev_id].write_zeroes = write_zeroes;

    int64_t size = vmar->devinfo[dev_id].size;

    /* bitmap words needed: sectors divided by sectors-per-word, rounded up */
    int64_t bitmap_size = (size/BDRV_SECTOR_SIZE) +
        (VMA_CLUSTER_SIZE/BDRV_SECTOR_SIZE) * BITS_PER_LONG - 1;
    bitmap_size /= (VMA_CLUSTER_SIZE/BDRV_SECTOR_SIZE) * BITS_PER_LONG;

    vmar->rstate[dev_id].bitmap_size = bitmap_size;
    vmar->rstate[dev_id].bitmap = g_new0(unsigned long, bitmap_size);

    /*
     * Round up so a trailing partial cluster is counted.  This matches the
     * completeness check after restore and keeps the total non-zero for
     * devices smaller than one cluster.
     */
    vmar->cluster_count += (size + VMA_CLUSTER_SIZE - 1) / VMA_CLUSTER_SIZE;
}
/*
 * Register @bs as the restore target of stream @dev_id.
 *
 * @dev_id must be non-zero and must not have a target yet.  The target has
 * to be at least as large as the stream and at most 4MB larger.
 *
 * Returns 0 on success, -1 with @errp set when the size does not fit.
 */
int vma_reader_register_bs(VmaReader *vmar, guint8 dev_id, BlockDriverState *bs,
                           bool write_zeroes, Error **errp)
{
    assert(vmar);
    assert(bs != NULL);
    assert(dev_id);
    assert(vmar->rstate[dev_id].bs == NULL);

    int64_t size = bdrv_getlength(bs);
    int64_t size_diff = size - vmar->devinfo[dev_id].size;

    /* storage types can have different size restrictions, so it
     * is not always possible to create an image with exact size.
     * So we tolerate a size difference up to 4MB.
     */
    if ((size_diff < 0) || (size_diff > 4*1024*1024)) {
        /* PRId64 instead of %zd: the arguments are 64-bit integers */
        error_setg(errp, "vma_reader_register_bs for stream %s failed - "
                   "unexpected size %" PRId64 " != %" PRId64,
                   vmar->devinfo[dev_id].devname,
                   size, (int64_t)vmar->devinfo[dev_id].size);
        return -1;
    }

    allocate_rstate(vmar, dev_id, bs, write_zeroes);

    return 0;
}
/*
 * write(2) wrapper that retries while the call fails with EINTR.
 * Returns whatever the final write() returned.
 */
static ssize_t safe_write(int fd, const void *buf, size_t count)
{
    ssize_t n;

    do {
        n = write(fd, buf, count);
    } while (n < 0 && errno == EINTR);

    return n;
}

/*
 * Write all @len bytes of @buf to @fd, continuing after short writes.
 *
 * Returns @len on success and -1 on failure.  The return type is signed so
 * the error value is representable; the previous size_t return could not
 * express it.
 */
static ssize_t full_write(int fd, const void *buf, size_t len)
{
    /* byte pointer: arithmetic on void * is not standard C */
    const unsigned char *pos = buf;
    size_t total = 0;

    while (len > 0) {
        ssize_t n = safe_write(fd, pos, len);

        if (n <= 0) {
            /* n == 0 would make no progress; treat it as a failure too */
            return -1;
        }
        pos += n;
        total += n;
        len -= n;
    }

    return total;
}
/*
 * Write @nb_sectors sectors from @buf for stream @dev_id.
 *
 * Data of the vmstate stream goes to @vmstate_fd, or is dropped when that
 * descriptor is negative.  Data of any other stream is written to @bs at
 * @sector_num.
 *
 * Returns 0 on success, -1 with @errp set when a write fails.
 */
static int restore_write_data(VmaReader *vmar, guint8 dev_id,
                              BlockDriverState *bs, int vmstate_fd,
                              unsigned char *buf, int64_t sector_num,
                              int nb_sectors, Error **errp)
{
    assert(vmar);

    if (dev_id != vmar->vmstate_stream) {
        int wr = bdrv_write(bs, sector_num, buf, nb_sectors);

        if (wr < 0) {
            error_setg(errp, "bdrv_write to %s failed (%d)",
                       bdrv_get_device_name(bs), wr);
            return -1;
        }
        return 0;
    }

    if (vmstate_fd < 0) {
        /* nobody wants the vmstate data */
        return 0;
    }

    int nbytes = nb_sectors * BDRV_SECTOR_SIZE;
    int rc = full_write(vmstate_fd, buf, nbytes);

    if (rc < 0) {
        error_setg(errp, "write vmstate failed %d", rc);
        return -1;
    }

    return 0;
}
/*
 * Process one extent held in @buf (@extent_size bytes, header included).
 *
 * Every non-empty blockinfo slot of the extent header names a cluster of
 * one stream.  For device streams the cluster is marked in the stream's
 * bitmap and duplicates are rejected; for the vmstate stream the clusters
 * must arrive in order.  Unless @verify is set, block data is written via
 * restore_write_data(), and zero blocks are written only when the stream
 * asked for write_zeroes.
 *
 * Returns 0 on success, -1 with @errp set on malformed input or a failed
 * write.
 */
static int restore_extent(VmaReader *vmar, unsigned char *buf,
                          int extent_size, int vmstate_fd,
                          bool verbose, bool verify, Error **errp)
{
    assert(vmar);
    assert(buf);

    VmaExtentHeader *ehead = (VmaExtentHeader *)buf;
    int start = VMA_EXTENT_HEADER_SIZE;
    int i;

    for (i = 0; i < VMA_BLOCKS_PER_EXTENT; i++) {
        /*
         * blockinfo layout: bits 0-31 cluster number, bits 32-39 device id,
         * bits 48-63 one bit per block of the cluster (set = data stored).
         */
        uint64_t block_info = GUINT64_FROM_BE(ehead->blockinfo[i]);
        uint64_t cluster_num = block_info & 0xffffffff;
        uint8_t dev_id = (block_info >> 32) & 0xff;
        uint16_t mask = block_info >> (32+16);
        int64_t max_sector;

        if (!dev_id) {
            continue;
        }

        VmaRestoreState *rstate = &vmar->rstate[dev_id];
        BlockDriverState *bs = NULL;

        if (dev_id != vmar->vmstate_stream) {
            bs = rstate->bs;
            /*
             * Without a bitmap the stream was never set up.  Report that
             * as an input error instead of tripping the bitmap assertions.
             */
            if ((!verify && !bs) || !rstate->bitmap) {
                error_setg(errp, "got wrong dev id %d", dev_id);
                return -1;
            }

            /* the cluster number comes from the archive: range-check it */
            if (cluster_num / BITS_PER_LONG >= (uint64_t)rstate->bitmap_size) {
                error_setg(errp, "got wrong block address - write bejond end");
                return -1;
            }

            if (vma_reader_get_bitmap(rstate, cluster_num)) {
                error_setg(errp, "found duplicated cluster %" PRIu64
                           " for stream %s",
                           cluster_num, vmar->devinfo[dev_id].devname);
                return -1;
            }
            vma_reader_set_bitmap(rstate, cluster_num, 1);

            max_sector = vmar->devinfo[dev_id].size/BDRV_SECTOR_SIZE;
        } else {
            max_sector = G_MAXINT64;
            if (cluster_num != vmar->vmstate_clusters) {
                error_setg(errp, "found out of order vmstate data");
                return -1;
            }
            vmar->vmstate_clusters++;
        }

        vmar->clusters_read++;

        /* no progress output without an expected total (avoids a /0) */
        if (verbose && vmar->cluster_count > 0) {
            time_t duration = time(NULL) - vmar->start_time;
            int percent = (vmar->clusters_read*100)/vmar->cluster_count;
            if (percent != vmar->clusters_read_per) {
                printf("progress %d%% (read %" PRId64 " bytes, duration %"
                       PRId64 " sec)\n",
                       percent,
                       (int64_t)(vmar->clusters_read*VMA_CLUSTER_SIZE),
                       (int64_t)duration);
                fflush(stdout);
                vmar->clusters_read_per = percent;
            }
        }

        /* try to write whole clusters to speedup restore */
        if (mask == 0xffff) {
            if ((start + VMA_CLUSTER_SIZE) > extent_size) {
                error_setg(errp, "short vma extent - too many blocks");
                return -1;
            }
            int64_t sector_num = (cluster_num * VMA_CLUSTER_SIZE) /
                BDRV_SECTOR_SIZE;
            int64_t end_sector = sector_num +
                VMA_CLUSTER_SIZE/BDRV_SECTOR_SIZE;

            /* clip the last cluster to the device size */
            if (end_sector > max_sector) {
                end_sector = max_sector;
            }

            if (end_sector <= sector_num) {
                error_setg(errp, "got wrong block address - write bejond end");
                return -1;
            }

            if (!verify) {
                int nb_sectors = end_sector - sector_num;
                if (restore_write_data(vmar, dev_id, bs, vmstate_fd,
                                       buf + start, sector_num, nb_sectors,
                                       errp) < 0) {
                    return -1;
                }
            }

            start += VMA_CLUSTER_SIZE;
        } else {
            int j;
            int bit = 1;

            for (j = 0; j < 16; j++) {
                int64_t sector_num = (cluster_num*VMA_CLUSTER_SIZE +
                                      j*VMA_BLOCK_SIZE)/BDRV_SECTOR_SIZE;

                int64_t end_sector = sector_num +
                    VMA_BLOCK_SIZE/BDRV_SECTOR_SIZE;
                if (end_sector > max_sector) {
                    end_sector = max_sector;
                }

                if (mask & bit) {
                    /* block j is stored in the extent */
                    if ((start + VMA_BLOCK_SIZE) > extent_size) {
                        error_setg(errp, "short vma extent - too many blocks");
                        return -1;
                    }

                    if (end_sector <= sector_num) {
                        error_setg(errp, "got wrong block address - "
                                   "write bejond end");
                        return -1;
                    }

                    if (!verify) {
                        int nb_sectors = end_sector - sector_num;
                        if (restore_write_data(vmar, dev_id, bs, vmstate_fd,
                                               buf + start, sector_num,
                                               nb_sectors, errp) < 0) {
                            return -1;
                        }
                    }

                    start += VMA_BLOCK_SIZE;

                } else {
                    /* block j is a zero block: no data in the extent */
                    if (end_sector > sector_num) {
                        /* Todo: use bdrv_co_write_zeroes (but that need to
                         * be run inside coroutine?)
                         */
                        int nb_sectors = end_sector - sector_num;
                        int zero_size = BDRV_SECTOR_SIZE*nb_sectors;
                        vmar->zero_cluster_data += zero_size;
                        if (mask != 0) {
                            vmar->partial_zero_cluster_data += zero_size;
                        }

                        if (rstate->write_zeroes && !verify) {
                            if (restore_write_data(vmar, dev_id, bs, vmstate_fd,
                                                   zero_vma_block, sector_num,
                                                   nb_sectors, errp) < 0) {
                                return -1;
                            }
                        }
                    }
                }

                bit = bit << 1;
            }
        }
    }

    /* all data announced by the blockinfo entries must be consumed exactly */
    if (start != extent_size) {
        error_setg(errp, "vma extent error - missing blocks");
        return -1;
    }

    return 0;
}
/*
 * Read all extents from the archive and pass them to restore_extent().
 *
 * Each extent header is checked (md5sum, uuid, magic) before its blocks
 * are processed.  After the last extent, every registered target is
 * flushed and every stream with a cluster bitmap (except "vmstate") is
 * checked for missing clusters.  With @verify set nothing is written.
 *
 * Returns 0 on success, -1 with @errp set on failure.
 */
static int vma_reader_restore_full(VmaReader *vmar, int vmstate_fd,
                                   bool verbose, bool verify,
                                   Error **errp)
{
    assert(vmar);
    assert(vmar->head_data);

    /* NOTE(review): this buffer lives on the stack; confirm that
     * VMA_MAX_EXTENT_SIZE (from vma.h) is acceptable for all callers. */
    unsigned char buf[VMA_MAX_EXTENT_SIZE];
    int buf_pos = 0;
    unsigned char md5sum[16];
    VmaHeader *h = (VmaHeader *)vmar->head_data;

    vmar->start_time = time(NULL);

    while (1) {
        /* top up the buffer; leftovers of the next extent are kept */
        int bytes = full_read(vmar->fd, buf + buf_pos, sizeof(buf) - buf_pos);
        if (bytes < 0) {
            error_setg(errp, "read failed - %s", g_strerror(errno));
            return -1;
        }

        buf_pos += bytes;

        if (!buf_pos) {
            break; /* EOF */
        }

        if (buf_pos < VMA_EXTENT_HEADER_SIZE) {
            error_setg(errp, "read short extent (%d bytes)", buf_pos);
            return -1;
        }

        VmaExtentHeader *ehead = (VmaExtentHeader *)buf;

        /* extract md5sum */
        memcpy(md5sum, ehead->md5sum, sizeof(ehead->md5sum));
        memset(ehead->md5sum, 0, sizeof(ehead->md5sum));

        g_checksum_reset(vmar->md5csum);
        g_checksum_update(vmar->md5csum, buf, VMA_EXTENT_HEADER_SIZE);
        gsize csize = 16;
        g_checksum_get_digest(vmar->md5csum, ehead->md5sum, &csize);

        if (memcmp(md5sum, ehead->md5sum, 16) != 0) {
            error_setg(errp, "wrong vma extent header chechsum");
            return -1;
        }

        if (memcmp(h->uuid, ehead->uuid, sizeof(ehead->uuid)) != 0) {
            error_setg(errp, "wrong vma extent uuid");
            return -1;
        }

        if (ehead->magic != VMA_EXTENT_MAGIC || ehead->reserved1 != 0) {
            error_setg(errp, "wrong vma extent header magic");
            return -1;
        }

        int block_count = GUINT16_FROM_BE(ehead->block_count);
        int extent_size = VMA_EXTENT_HEADER_SIZE + block_count*VMA_BLOCK_SIZE;

        if (buf_pos < extent_size) {
            error_setg(errp, "short vma extent (%d < %d)", buf_pos,
                       extent_size);
            return -1;
        }

        if (restore_extent(vmar, buf, extent_size, vmstate_fd, verbose,
                           verify, errp) < 0) {
            return -1;
        }

        /* move the start of the following extent to the buffer front */
        if (buf_pos > extent_size) {
            memmove(buf, buf + extent_size, buf_pos - extent_size);
            buf_pos = buf_pos - extent_size;
        } else {
            buf_pos = 0;
        }
    }

    bdrv_drain_all();

    int i;
    for (i = 1; i < 256; i++) {
        VmaRestoreState *rstate = &vmar->rstate[i];

        if (rstate->bs && bdrv_flush(rstate->bs) < 0) {
            error_setg(errp, "vma bdrv_flush %s failed",
                       vmar->devinfo[i].devname);
            return -1;
        }

        /*
         * Streams without a bitmap were never set up.  Streams with a
         * bitmap but no target exist in verify mode and must be checked
         * for completeness as well.
         */
        if (!rstate->bitmap) {
            continue;
        }

        if (vmar->devinfo[i].size &&
            (strcmp(vmar->devinfo[i].devname, "vmstate") != 0)) {
            int64_t cluster_num, end;

            end = (vmar->devinfo[i].size + VMA_CLUSTER_SIZE - 1) /
                VMA_CLUSTER_SIZE;

            for (cluster_num = 0; cluster_num < end; cluster_num++) {
                if (!vma_reader_get_bitmap(rstate, cluster_num)) {
                    error_setg(errp, "detected missing cluster %" PRId64 " "
                               "for stream %s", cluster_num,
                               vmar->devinfo[i].devname);
                    return -1;
                }
            }
        }
    }

    if (verbose) {
        printf("total bytes read %" PRId64 ", sparse bytes %" PRId64
               " (%.3g%%)\n",
               (int64_t)(vmar->clusters_read*VMA_CLUSTER_SIZE),
               vmar->zero_cluster_data,
               (double)(100.0*vmar->zero_cluster_data)/
               (vmar->clusters_read*VMA_CLUSTER_SIZE));

        printf("space reduction due to 4K zero bocks %.3g%%\n",
               (double)(100.0*vmar->partial_zero_cluster_data) /
               (vmar->clusters_read*VMA_CLUSTER_SIZE-vmar->zero_cluster_data));
    }

    return 0;
}
/*
 * Restore the archive: device streams go to their registered targets, the
 * vmstate stream goes to @vmstate_fd.  Returns 0 on success, -1 with @errp
 * set on failure.
 */
int vma_reader_restore(VmaReader *vmar, int vmstate_fd, bool verbose,
                       Error **errp)
{
    const bool verify_only = false;

    return vma_reader_restore_full(vmar, vmstate_fd, verbose, verify_only,
                                   errp);
}
/*
 * Check the archive without writing anything: a cluster bitmap is
 * allocated for every stream listed in the header, then the whole archive
 * is read in verify mode.  Returns 0 on success, -1 with @errp set on
 * failure.
 */
int vma_reader_verify(VmaReader *vmar, bool verbose, Error **errp)
{
    /*
     * int rather than guint8: the header can describe streams 1..255, and
     * an 8-bit counter cannot express the "< 256" bound without wrapping.
     */
    int dev_id;

    for (dev_id = 1; dev_id < 256; dev_id++) {
        if (vma_reader_get_device_info(vmar, dev_id)) {
            allocate_rstate(vmar, dev_id, NULL, false);
        }
    }

    return vma_reader_restore_full(vmar, -1, verbose, true, errp);
}