From 940fe79cafbf9e8d262af3bc0db56269c2bf74ec Mon Sep 17 00:00:00 2001
From: Jan Wolff <janw@mailbox.org>
Date: Fri, 21 Jun 2019 09:35:42 +0200
Subject: [PATCH] initial commit

---
 LICENSE.md |  18 ++++
 README.md  |  15 +++
 vma.py     | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 336 insertions(+)
 create mode 100644 LICENSE.md
 create mode 100644 README.md
 create mode 100755 vma.py

diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..6ed8478
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,18 @@
+Copyright (c) 2019 Jan Wolff
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..962f5d5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+VMA extractor
+=============
+
+`vma.py` implements a VMA extraction tool in Python 3.
+
+Usage:
+```sh
+./vma.py path/to/source.vma path/to/target/directory
+```
+
+I think it is pretty important to be able to read Proxmox backups outside of a
+Proxmox environment. Yet, porting their VMA implementation to a standalone
+tool proved difficult. VMA-Reader and VMA-Writer are implemented as patches to
+the Proxmox-patched version of Qemu and are thus very difficult to compile on
+non-Proxmox systems.
diff --git a/vma.py b/vma.py
new file mode 100755
index 0000000..820323c
--- /dev/null
+++ b/vma.py
@@ -0,0 +1,303 @@
+#!/usr/bin/env python3
+import os
+import sys
+import hashlib
+import struct
+import argparse
+
+
+class VmaHeader():
+    """VMA file header parsed from the seekable binary file object `fo`.
+
+    Leaves `fo` at `header_size`; raises ValueError on malformed input.
+    """
+    def __init__(self, fo):
+        # 0 -  3:   magic
+        #     VMA magic string ("VMA\x00")
+        magic = fo.read(4)
+        if magic != b'VMA\0':
+            raise ValueError(f'not a VMA file: bad magic {magic!r}')
+
+        # 4 -  7:   version
+        #     Version number (valid value is 1)
+        version = int.from_bytes(fo.read(4), 'big')
+        if version != 1:
+            raise ValueError(f'unsupported VMA version: {version}')
+
+        # 8 - 23:   uuid
+        #     Unique ID, Same uuid is used to mark extents.
+        self.uuid = fo.read(16)
+
+        # 24 - 31:   ctime
+        #     Backup time stamp (seconds since epoch)
+        self.ctime = int.from_bytes(fo.read(8), 'big')
+
+        # 32 - 47:   md5sum
+        #     Header checksum (from byte 0 to header_size). This field
+        #     is filled with zero to generate the checksum.
+        self.md5sum = fo.read(16)
+
+        # 48 - 51:   blob_buffer_offset
+        #     Start of blob buffer (multiple of 512)
+        self.blob_buffer_offset = int.from_bytes(fo.read(4), 'big')
+
+        # 52 - 55:   blob_buffer_size
+        #     Size of blob buffer (multiple of 512)
+        self.blob_buffer_size = int.from_bytes(fo.read(4), 'big')
+
+        # 56 - 59:   header_size
+        #     Overall size of this header (multiple of 512)
+        self.header_size = int.from_bytes(fo.read(4), 'big')
+
+        # 60 - 2043: reserved
+        fo.seek(1984, os.SEEK_CUR)
+
+        # 2044 - 3067: uint32_t config_names[256]
+        #     Offsets into blob_buffer table
+        self.config_names = [
+            int.from_bytes(fo.read(4), 'big') for _ in range(256)]
+
+        # 3068 - 4091: uint32_t config_data[256]
+        #     Offsets into blob_buffer table
+        self.config_data = [
+            int.from_bytes(fo.read(4), 'big') for _ in range(256)]
+
+        # 4092 - 4095: reserved
+        fo.seek(4, os.SEEK_CUR)
+
+        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
+        #     The offset in this table is used as 'dev_id' inside
+        #     the data streams.
+        self.dev_info = [VmaDeviceInfoHeader(fo, self) for _ in range(256)]
+
+        # blob_buffer_offset - header_size: blob buffer. Blobs are addressed by
+        # byte offset, hence the dict; byte 0 is padding, so start at 1.
+        blob_end = self.blob_buffer_offset + self.blob_buffer_size
+        fo.seek(self.blob_buffer_offset + 1, os.SEEK_SET)
+        self.blob_buffer = {}
+        while fo.tell() < blob_end:
+            blob_offset = fo.tell() - self.blob_buffer_offset
+            self.blob_buffer[blob_offset] = Blob(fo)
+            if fo.tell() - self.blob_buffer_offset == blob_offset:
+                raise ValueError('truncated VMA header: blob buffer cut off')
+
+        # make sure the file object points at the end of the vma header
+        fo.seek(self.header_size, os.SEEK_SET)
+
+
+class VmaDeviceInfoHeader():
+    """One 32-byte entry of the `dev_info` table in the VMA header."""
+
+    def __init__(self, fo, vma_header):
+        self.__vma_header = vma_header
+
+        # 0 -  3:   device name (offset into blob_buffer table)
+        # 4 -  7:   reserved
+        self.device_name = int.from_bytes(fo.read(4), 'big')
+        fo.seek(4, os.SEEK_CUR)
+
+        # 8 - 15:   device size in bytes
+        # 16 - 31:   reserved
+        self.device_size = int.from_bytes(fo.read(8), 'big')
+        fo.seek(16, os.SEEK_CUR)
+
+    def get_name(self):
+        """Return the device name as str, cut at the first NUL byte."""
+        raw_name = self.__vma_header.blob_buffer[self.device_name].data
+        return raw_name.partition(b'\0')[0].decode('utf-8')
+
+
+class VmaExtentHeader():
+    """One 512-byte VMA extent header; raises ValueError on a bad magic."""
+    def __init__(self, fo, vma_header):
+        # 0 -  3:   magic
+        #     VMA extent magic string ("VMAE")
+        magic = fo.read(4)
+        if magic != b'VMAE':
+            raise ValueError(f'bad VMA extent magic: {magic!r}')
+
+        # 4 -  5:   reserved
+        fo.seek(2, os.SEEK_CUR)
+
+        # 6 -  7:   block_count
+        #     Overall number of contained 4K blocks
+        self.block_count = int.from_bytes(fo.read(2), 'big')
+
+        # 8 - 23:   uuid
+        #     Unique ID, Same uuid as used in the VMA header.
+        self.uuid = fo.read(16)
+
+        # 24 - 39:   md5sum
+        #     Header checksum (from byte 0 to header_size). This field
+        #     is filled with zero to generate the checksum.
+        self.md5sum = fo.read(16)
+
+        # 40 - 511:   blockinfo[59]
+        self.blockinfo = [Blockinfo(fo, vma_header) for _ in range(59)]
+
+
+class Blob():
+    """Length-prefixed byte string read from the VMA blob buffer."""
+
+    def __init__(self, fo):
+        # 2-byte length prefix, LITTLE endian as in the C vma-reader:
+        #    size = head_data[bstart] + (head_data[bstart+1] << 8)
+        length = int.from_bytes(fo.read(2), byteorder='little')
+        self.size, self.data = length, fo.read(length)
+
+
+class Blockinfo():
+    """One 8-byte blockinfo entry of a VMA extent header."""
+
+    CLUSTER_SIZE = 65536
+
+    def __init__(self, fo, vma_header):
+        def read_uint(width):
+            return int.from_bytes(fo.read(width), 'big')
+
+        self.__vma_header = vma_header
+        # 0 - 1:   mask
+        self.mask = read_uint(2)
+        # 2:   reserved
+        fo.seek(1, os.SEEK_CUR)
+        # 3:   dev_id (offset into dev_info table)
+        self.dev_id = read_uint(1)
+        # 4 - 7:   cluster_num
+        self.cluster_num = read_uint(4)
+
+
+def extract_configs(fo, args, vma_header):
+    """
+    Write every config stored in the VMA header into args.destination.
+
+    Configs in VMA are composed of two blobs. One specifies the config's
+    filename (a null-terminated string) and the other contains the
+    config's content (not terminated). `fo` is not used here.
+    """
+    if args.verbose: print('extracting configs...')
+
+    blobs = vma_header.blob_buffer
+    names_and_data = zip(vma_header.config_names, vma_header.config_data)
+    for name_ref, data_ref in names_and_data:
+        if name_ref == 0: continue
+        # interpret filename as a null-terminated utf-8 string; it comes
+        # from the archive, so drop any directory part to make sure a
+        # crafted VMA cannot write outside of the destination directory
+        config_name = blobs[name_ref].data.split(b'\0')[0].decode('utf-8')
+        config_name = os.path.basename(config_name)
+        config_data = blobs[data_ref].data
+        if args.verbose: print(f'{config_name}...', end='')
+        with open(os.path.join(args.destination, config_name), 'wb') as config_fo:
+            config_fo.write(config_data)
+        if args.verbose: print(' OK')
+
+
+def extract(fo, args):
+    """
+    Extract all configs and device images of the VMA read from `fo`.
+
+    `fo` must be a seekable binary file object. Output files are created
+    in args.destination (created if missing); args.verbose enables
+    progress messages. Raises ValueError if an extent has a foreign uuid.
+    """
+    os.makedirs(args.destination, exist_ok=True)
+
+    fo.seek(0, os.SEEK_END)
+    filesize = fo.tell()
+    fo.seek(0, os.SEEK_SET)
+
+    vma_header = VmaHeader(fo)
+    extract_configs(fo, args, vma_header)
+
+    # extract_configs may move the read head somewhere into the blob buffer
+    # make sure we are back at the end of the header
+    fo.seek(vma_header.header_size, os.SEEK_SET)
+
+    if args.verbose: print('extracting devices...')
+
+    # open file handlers for all devices within the VMA so we can write to
+    # arbitrary devices; the finally clause closes them even on errors
+    device_fos = {}
+    try:
+        for dev_id, dev_info in enumerate(vma_header.dev_info):
+            if dev_info.device_size > 0:
+                # the name comes from the archive: drop any directory part
+                name = os.path.basename(dev_info.get_name())
+                if args.verbose: print(name)
+                device_fos[dev_id] = open(os.path.join(args.destination, name), 'wb')
+
+        if args.verbose: print('this may take a while...')
+
+        while fo.tell() < filesize:
+            # when there is data to read at this point, we can safely expect
+            # a full extent header with additional clusters
+            extent_header = VmaExtentHeader(fo, vma_header)
+            if vma_header.uuid != extent_header.uuid:
+                raise ValueError('extent uuid does not match the VMA header')
+
+            for blockinfo in extent_header.blockinfo:
+                if blockinfo.dev_id == 0: continue
+
+                device_fo = device_fos[blockinfo.dev_id]
+                cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE
+
+                # Compare against this device's own write position: extents
+                # may interleave clusters of several devices, so the last
+                # cluster number seen may belong to another device.
+                if device_fo.tell() != cluster_pos:
+                    if args.verbose:
+                        print('non sequential cluster encountered...')
+                    device_fo.seek(0, os.SEEK_END)
+                    padding = cluster_pos - device_fo.tell()
+                    if padding > 0:
+                        # clusters were skipped, fill the gap with zeros
+                        if args.verbose:
+                            print(f'{blockinfo.cluster_num}')
+                            print(f'adding {padding} bytes of padding...')
+
+                        # write padding in chunks of 4096 bytes to avoid
+                        # memory errors
+                        while padding > 0:
+                            device_fo.write(b'\0' * min(padding, 4096))
+                            padding -= 4096
+
+                    # seek to start of new cluster
+                    device_fo.seek(cluster_pos, os.SEEK_SET)
+
+                for i in range(16):
+                    # a clear bit in the mask marks a 4k block of zeros
+                    if (1 << i) & blockinfo.mask:
+                        device_fo.write(fo.read(4096))
+                    else:
+                        device_fo.write(b'\0' * 4096)
+    finally:
+        if args.verbose: print('closing file handles...')
+        for device_fo in device_fos.values():
+            device_fo.close()
+
+    if args.verbose: print('done')
+
+
+def main():
+    """Parse the command line and run the extraction; return exit code."""
+    cli = argparse.ArgumentParser()
+    for positional in ('filename', 'destination'):
+        cli.add_argument(positional, type=str)
+    for short, long_ in (('-v', '--verbose'), ('-f', '--force')):
+        cli.add_argument(short, long_, default=False, action='store_true')
+    args = cli.parse_args()
+
+    error = None
+    if not os.path.exists(args.filename):
+        error = 'Error! Source file does not exist!'
+    elif os.path.exists(args.destination) and not args.force:
+        error = 'Error! Destination path exists!'
+    if error is not None:
+        print(error)
+        return 1
+    with open(args.filename, 'rb') as vma_fo:
+        extract(vma_fo, args)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())