commit 940fe79cafbf9e8d262af3bc0db56269c2bf74ec Author: Jan Wolff Date: Fri Jun 21 09:35:42 2019 +0200 initial commit diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..6ed8478 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,18 @@ +Copyright (c) 2019 Jan Wolff + +This software is provided 'as-is', without any express or implied +warranty. In no event will the authors be held liable for any damages +arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. + diff --git a/README.md b/README.md new file mode 100644 index 0000000..962f5d5 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +VMA extractor +============= + +`vma.py` implements a VMA extraction tool in Python 3. + +Usage: +```sh +./vma.py path/to/source.vma path/to/target/directory +``` + +I think it is pretty important to be able to read Proxmox backups outside of a +Proxmox environment. Yet, porting their VMA implementation to a standalone +tool proved difficult. VMA-Reader and VMA-Writer are implemented as patches to +the Proxmox-patched version of Qemu and are thus very difficult to compile on +non-Proxmox systems. 
#!/usr/bin/env python3
"""Extract the configs and device images stored in a VMA archive.

Usage:
    ./vma.py path/to/source.vma path/to/target/directory
"""
# Reconstructed from the "diff --git a/vma.py b/vma.py" hunk (new file,
# mode 100755) of the initial commit.
import argparse
import contextlib
import hashlib  # noqa: F401 -- imported by the original file, not used yet
import io
import os
import struct
import sys

# A cluster is stored as 16 blocks of 4096 bytes. A 16-bit mask in every
# block info entry tells which of those blocks are present in the archive;
# the others consist of zeros only and are not stored.
BLOCK_SIZE = 4096
BLOCKS_PER_CLUSTER = 16
_ZERO_BLOCK = bytes(BLOCK_SIZE)
_ZERO_CLUSTER = bytes(BLOCK_SIZE * BLOCKS_PER_CLUSTER)


class VmaFormatError(ValueError):
    """Raised when the input is not a well-formed VMA archive."""


def _read_exact(fo, size):
    """Read exactly `size` bytes from `fo` or raise VmaFormatError."""
    data = fo.read(size)
    if len(data) != size:
        raise VmaFormatError(
            f'unexpected end of file (wanted {size} bytes, got {len(data)})')
    return data


def _output_path(destination, name):
    """Return the path of `name` inside `destination`.

    Names come from the archive, so anything that is not a plain file name
    (empty, '.', '..', or containing a path separator) is rejected instead
    of being allowed to point outside of the destination directory.
    """
    if name in ('', '.', '..') or os.path.basename(name) != name:
        raise VmaFormatError(f'unsafe file name in archive: {name!r}')
    return os.path.join(destination, name)


def _lookup_blob(vma_header, offset, what):
    """Return the data of the blob at `offset` or raise VmaFormatError."""
    try:
        return vma_header.blob_buffer[offset].data
    except KeyError:
        raise VmaFormatError(
            f'{what} refers to a missing blob (offset {offset})') from None


class VmaHeader():
    """The VMA file header, parsed from the start of a file object.

    After construction the file object is positioned at `header_size`,
    i.e. directly behind the header.

    Raises:
        VmaFormatError: on a bad magic, an unsupported version or when the
            fixed part of the header is truncated.
    """

    # bytes 4 - 59: version, uuid, ctime, md5sum, blob_buffer_offset,
    # blob_buffer_size, header_size (all big endian)
    _FIELDS = struct.Struct('>I16sQ16sIII')
    # uint32_t table[256]
    _OFFSET_TABLE = struct.Struct('>256I')

    def __init__(self, fo):
        # 0 - 3: magic
        #    VMA magic string ("VMA\x00")
        if fo.read(4) != b'VMA\0':
            raise VmaFormatError('not a VMA file (bad magic)')

        # 4 - 7: version
        #    Version number (valid value is 1)
        # 8 - 23: uuid
        #    Unique ID, Same uuid is used to mark extents.
        # 24 - 31: ctime
        #    Backup time stamp (seconds since epoch)
        # 32 - 47: md5sum
        #    Header checksum (from byte 0 to header_size). This field
        #    is filled with zero to generate the checksum.
        #    (stored here, but not verified)
        # 48 - 51: blob_buffer_offset
        #    Start of blob buffer (multiple of 512)
        # 52 - 55: blob_buffer_size
        #    Size of blob buffer (multiple of 512)
        # 56 - 59: header_size
        #    Overall size of this header (multiple of 512)
        (version, self.uuid, self.ctime, self.md5sum,
         self.blob_buffer_offset, self.blob_buffer_size,
         self.header_size) = self._FIELDS.unpack(
             _read_exact(fo, self._FIELDS.size))
        if version != 1:
            raise VmaFormatError(f'unsupported VMA version {version}')

        # 60 - 2043: reserved
        fo.seek(1984, os.SEEK_CUR)

        # 2044 - 3067: uint32_t config_names[256]
        #    Offsets into blob_buffer table
        self.config_names = list(self._OFFSET_TABLE.unpack(
            _read_exact(fo, self._OFFSET_TABLE.size)))

        # 3068 - 4091: uint32_t config_data[256]
        #    Offsets into blob_buffer table
        self.config_data = list(self._OFFSET_TABLE.unpack(
            _read_exact(fo, self._OFFSET_TABLE.size)))

        # 4092 - 4095: reserved
        fo.seek(4, os.SEEK_CUR)

        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
        #    The offset in this table is used as 'dev_id' inside
        #    the data streams.
        self.dev_info = [VmaDeviceInfoHeader(fo, self) for _ in range(256)]

        # 12288 - header_size: Blob buffer
        #
        # the blob buffer layout is very odd. there appears to be an
        # additional byte of padding at the beginning, so the first blob
        # lives at offset 1.
        # since byte-wise offsets are used to address the blob buffer, the
        # blob metadata is stored in a hashmap, with the offsets as the keys
        #
        # The buffer is parsed from an in-memory copy: the scan can then
        # neither run past the end of the buffer nor spin forever when the
        # file is shorter than the header claims.
        fo.seek(self.blob_buffer_offset, os.SEEK_SET)
        blob_bytes = fo.read(self.blob_buffer_size)
        blob_fo = io.BytesIO(blob_bytes)
        blob_fo.seek(1, os.SEEK_SET)
        self.blob_buffer = {}
        while blob_fo.tell() < len(blob_bytes):
            blob_offset = blob_fo.tell()
            self.blob_buffer[blob_offset] = Blob(blob_fo)

        # make sure the file object points at the end of the vma header
        fo.seek(self.header_size, os.SEEK_SET)


class VmaDeviceInfoHeader():
    """One 32 byte entry of the dev_info table inside the VMA header."""

    # 0 - 3: device name (offset into blob_buffer table)
    # 4 - 7: reserved
    # 8 - 15: device size in bytes
    # 16 - 31: reserved
    _LAYOUT = struct.Struct('>I4xQ16x')

    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header
        self.device_name, self.device_size = self._LAYOUT.unpack(
            _read_exact(fo, self._LAYOUT.size))

    def get_name(self):
        """Return the device name as a str.

        The name blob is treated as a null-terminated utf-8 string.

        Raises:
            VmaFormatError: if no blob exists at the recorded offset.
        """
        name = _lookup_blob(self.__vma_header, self.device_name,
                            'device name')
        return name.split(b'\0')[0].decode('utf-8')


class VmaExtentHeader():
    """A 512 byte extent header: 40 bytes of metadata and 59 block infos.

    Raises:
        VmaFormatError: on a bad magic or a truncated header.
    """

    # bytes 4 - 39: reserved (2), block_count, uuid, md5sum
    _FIELDS = struct.Struct('>2xH16s16s')

    def __init__(self, fo, vma_header):
        # 0 - 3: magic
        #    VMA extent magic string ("VMAE")
        if fo.read(4) != b'VMAE':
            raise VmaFormatError('bad extent magic')

        # 4 - 5: reserved
        # 6 - 7: block_count
        #    Overall number of contained 4K block
        # 8 - 23: uuid
        #    Unique ID, Same uuid as used in the VMA header.
        # 24 - 39: md5sum
        #    Header checksum (from byte 0 to header_size). This field
        #    is filled with zero to generate the checksum.
        #    (stored here, but not verified)
        self.block_count, self.uuid, self.md5sum = self._FIELDS.unpack(
            _read_exact(fo, self._FIELDS.size))

        # 40 - 511: blockinfo[59]
        self.blockinfo = [Blockinfo(fo, vma_header) for _ in range(59)]


class Blob():
    """A length-prefixed entry of the blob buffer.

    Short reads are tolerated on purpose: the tail of the blob buffer is
    padding and may not form a complete entry.
    """

    def __init__(self, fo):
        # the size of a blob is a two-byte int in LITTLE endian
        # source: original c code of vma-reader
        #   uint32_t size = vmar->head_data[bstart] +
        #       (vmar->head_data[bstart+1] << 8);
        self.size = int.from_bytes(fo.read(2), 'little')
        self.data = fo.read(self.size)


class Blockinfo():
    """One 8 byte block info entry of an extent header."""

    CLUSTER_SIZE = 65536

    # 0 - 1: mask
    # 2: reserved
    # 3: dev_id
    #    Device ID (offset into dev_info table)
    # 4 - 7: cluster_num
    _LAYOUT = struct.Struct('>HxBI')

    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header
        self.mask, self.dev_id, self.cluster_num = self._LAYOUT.unpack(
            _read_exact(fo, self._LAYOUT.size))


def extract_configs(fo, args, vma_header):
    """
    Write every config stored in the VMA header to args.destination.

    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.
    The filename seems to be a null-terminated string, while the content is
    not terminated.

    `fo` is accepted for interface compatibility; all data comes from the
    already parsed `vma_header`.

    Raises:
        VmaFormatError: if a config refers to a missing blob or its name is
            not a plain file name.
    """

    if args.verbose: print('extracting configs...')

    for name_offset, data_offset in zip(vma_header.config_names,
                                        vma_header.config_data):
        # unused table slots are zero
        if name_offset == 0:
            continue

        # interpret filename as a null-terminated utf-8 string
        config_name = _lookup_blob(vma_header, name_offset, 'config name')
        config_name = config_name.split(b'\0')[0].decode('utf-8')

        if args.verbose: print(f'{config_name}...', end='')

        config_data = _lookup_blob(vma_header, data_offset, 'config data')

        config_path = _output_path(args.destination, config_name)
        with open(config_path, 'wb') as config_fo:
            config_fo.write(config_data)

        if args.verbose: print(' OK')


def _seek_to_cluster(device_fo, blockinfo, verbose):
    """Position `device_fo` at the start of the cluster of `blockinfo`.

    If the file is still shorter than the cluster position, the gap is
    filled with zeros first.
    """
    cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE

    device_fo.seek(0, os.SEEK_END)
    padding = cluster_pos - device_fo.tell()
    if padding > 0:
        # add padding for missing clusters
        if verbose:
            print(f'{blockinfo.cluster_num}')
            print(f'adding {padding} bytes of padding...')

        # write the padding in bounded chunks to avoid memory errors
        while padding > 0:
            chunk = min(padding, len(_ZERO_CLUSTER))
            device_fo.write(_ZERO_CLUSTER[:chunk])
            padding -= chunk

    # seek to start of new cluster
    device_fo.seek(cluster_pos, os.SEEK_SET)


def _extract_extents(fo, filesize, vma_header, device_fos, verbose):
    """Copy the clusters of all extents up to `filesize` to the devices."""
    while fo.tell() < filesize:
        # when there is data to read at this point, we can safely expect a
        # full extent header with additional clusters
        extent_header = VmaExtentHeader(fo, vma_header)
        if extent_header.uuid != vma_header.uuid:
            raise VmaFormatError('extent uuid does not match the VMA header')

        for blockinfo in extent_header.blockinfo:
            # dev_id 0 marks an unused block info entry
            if blockinfo.dev_id == 0:
                continue

            try:
                device_fo = device_fos[blockinfo.dev_id]
            except KeyError:
                raise VmaFormatError(
                    f'extent refers to unknown device {blockinfo.dev_id}'
                ) from None

            # Clusters of several devices may be interleaved, so the check
            # has to be made against the position of *this* device's file
            # and not against the previously seen cluster number.
            cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE
            if device_fo.tell() != cluster_pos:
                if verbose: print('non sequential cluster encountered...')
                _seek_to_cluster(device_fo, blockinfo, verbose)

            for i in range(BLOCKS_PER_CLUSTER):
                # a 2-bytes wide bitmask indicates 4k blocks with only zeros
                if (1 << i) & blockinfo.mask:
                    device_fo.write(_read_exact(fo, BLOCK_SIZE))
                else:
                    device_fo.write(_ZERO_BLOCK)


def extract(fo, args):
    """Extract all configs and devices of the VMA archive `fo`.

    Args:
        fo: seekable binary file object of the archive.
        args: namespace providing `destination` (target directory, created
            if missing) and `verbose`.

    Raises:
        VmaFormatError: if the archive is malformed or truncated.
    """
    os.makedirs(args.destination, exist_ok=True)

    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)

    vma_header = VmaHeader(fo)

    extract_configs(fo, args, vma_header)

    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)

    if args.verbose: print('extracting devices...')

    # open file handlers for all devices within the VMA
    # so we can easily append data to arbitrary devices;
    # the ExitStack closes all of them again, even if extraction fails
    with contextlib.ExitStack() as stack:
        device_fos = {}
        for dev_id, dev_info in enumerate(vma_header.dev_info):
            if dev_info.device_size > 0:
                device_name = dev_info.get_name()
                if args.verbose: print(device_name)
                device_path = _output_path(args.destination, device_name)
                device_fos[dev_id] = stack.enter_context(
                    open(device_path, 'wb'))

        if args.verbose: print('this may take a while...')

        _extract_extents(fo, filesize, vma_header, device_fos, args.verbose)

        if args.verbose: print('closing file handles...')

    if args.verbose: print('done')


def main():
    """Command line entry point; returns the process exit code."""
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', type=str)
    parser.add_argument('destination', type=str)
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('-f', '--force', default=False, action='store_true')
    args = parser.parse_args()

    if not os.path.exists(args.filename):
        print('Error! Source file does not exist!')
        return 1

    if os.path.exists(args.destination) and not args.force:
        print('Error! Destination path exists!')
        return 1

    try:
        with open(args.filename, 'rb') as fo:
            extract(fo, args)
    except VmaFormatError as err:
        print(f'Error! {err}')
        return 1

    return 0


if __name__ == '__main__':
    sys.exit(main())