initial commit

2019-06-21 09:35:42 +02:00 · 2019-06-21 09:35:42 +02:00 · 940fe79caf
commit 940fe79caf
3 changed files with 336 additions and 0 deletions
--- a/LICENSE.md
+++ b/LICENSE.md
@ -0,0 +1,18 @@
 Copyright (c) 2019 Jan Wolff
 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
 arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it
 freely, subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
--- a/README.md
+++ b/README.md
@ -0,0 +1,15 @@
 VMA extractor
 =============
 `vma.py` implements a VMA extraction tool in Python 3.
 Usage:
 ```sh
 ./vma.py path/to/source.vma path/to/target/directory
 ```
 I think it is pretty important to be able to read Proxmox backups outside of a
 Proxmox environment. Yet, porting their VMA implementation to a standalone
 tool proved difficult. VMA-Reader and VMA-Writer are implemented as patches to
 the Proxmox-patched version of Qemu and are thus very difficult to compile on
 non-Proxmox systems.
--- a/vma.py
+++ b/vma.py
@ -0,0 +1,303 @@
 #!/usr/bin/env python3
 import os
 import sys
 import hashlib
 import struct
 import argparse
 class VmaHeader():
    """Parsed header of a VMA archive, including its blob buffer.

    The constructor reads from ``fo`` (expected to be positioned at the
    start of the archive) and leaves it positioned at ``header_size``, i.e.
    directly behind the header.

    Attributes set by the constructor: ``uuid``, ``ctime``, ``md5sum``,
    ``blob_buffer_offset``, ``blob_buffer_size``, ``header_size``,
    ``config_names``, ``config_data``, ``dev_info`` and ``blob_buffer`` (a
    dict mapping a blob's byte offset inside the blob buffer to its
    ``Blob``). The stored ``md5sum`` is not verified here.

    Raises:
        ValueError: if the magic string or the version is not the expected
            one, or if the blob buffer runs past the end of the file.
    """
    def __init__(self, fo):
        # 0 -  3:   magic
        #     VMA magic string ("VMA\x00")
        # Raise explicitly instead of using assert: the check validates
        # file content and must survive running python with -O.
        magic = fo.read(4)
        if magic != b'VMA\0':
            raise ValueError(f'not a VMA file (bad magic {magic!r})')
        # 4 -  7:   version
        #     Version number (valid value is 1)
        version = int.from_bytes(fo.read(4), 'big')
        if version != 1:
            raise ValueError(f'unsupported VMA version {version}')
        # 8 - 23:   uuid
        #     Unique ID, Same uuid is used to mark extents.
        self.uuid = fo.read(16)
        # 24 - 31:   ctime
        #     Backup time stamp (seconds since epoch)
        self.ctime = int.from_bytes(fo.read(8), 'big')
        # 32 - 47:   md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)
        # 48 - 51:   blob_buffer_offset
        #     Start of blob buffer (multiple of 512)
        self.blob_buffer_offset = int.from_bytes(fo.read(4), 'big')
        # 52 - 55:   blob_buffer_size
        #     Size of blob buffer (multiple of 512)
        self.blob_buffer_size = int.from_bytes(fo.read(4), 'big')
        # 56 - 59:   header_size
        #     Overall size of this header (multiple of 512)
        self.header_size = int.from_bytes(fo.read(4), 'big')
        # 60 - 2043: reserved
        fo.seek(1984, os.SEEK_CUR)
        # 2044 - 3067: uint32_t config_names[256]
        #     Offsets into blob_buffer table
        self.config_names = [
            int.from_bytes(fo.read(4), 'big') for _ in range(256)
        ]
        # 3068 - 4091: uint32_t config_data[256]
        #     Offsets into blob_buffer table
        self.config_data = [
            int.from_bytes(fo.read(4), 'big') for _ in range(256)
        ]
        # 4092 - 4095: reserved
        fo.seek(4, os.SEEK_CUR)
        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
        #     The offset in this table is used as 'dev_id' inside
        #     the data streams.
        self.dev_info = [VmaDeviceInfoHeader(fo, self) for _ in range(256)]
        # 12288 - header_size: Blob buffer
        # The first byte of the blob buffer is padding, so the first blob
        # lives at offset 1. Seek to the position announced by the header
        # instead of relying on where the device table happened to end; the
        # dictionary keys below are computed against the same offset.
        fo.seek(self.blob_buffer_offset + 1, os.SEEK_SET)
        # since byte-wise offsets are used to address the blob buffer, the
        # blob metadata is stored in a hashmap, with the offsets as the keys
        self.blob_buffer = {}
        blob_buffer_end = self.blob_buffer_offset + self.blob_buffer_size
        # a blob needs at least its two size bytes inside the buffer; this
        # also avoids parsing a bogus blob that straddles the buffer's end
        while fo.tell() + 2 <= blob_buffer_end:
            blob_start = fo.tell()
            self.blob_buffer[blob_start - self.blob_buffer_offset] = Blob(fo)
            if fo.tell() == blob_start:
                # nothing could be read: the file ends inside the header,
                # and looping on would never terminate
                raise ValueError('truncated VMA header: blob buffer '
                                 'extends past the end of the file')
        # make sure the file object points at the end of the vma header
        fo.seek(self.header_size, os.SEEK_SET)
 class VmaDeviceInfoHeader():
    """One entry of the device table stored in the VMA header.

    A complete entry occupies 32 bytes; parsing one moves ``fo`` forward by
    that amount. ``device_name`` is kept as an offset into the header's blob
    buffer and is only resolved to text by ``get_name``.
    """
    def __init__(self, fo, vma_header):
        def read_uint(width):
            # all integers of this entry are stored big endian
            return int.from_bytes(fo.read(width), byteorder='big')

        def skip(count):
            fo.seek(count, os.SEEK_CUR)

        # the owning header is needed later to look the name up
        self.__vma_header = vma_header
        # bytes 0 - 3: device name (offset into the blob buffer)
        self.device_name = read_uint(4)
        # bytes 4 - 7: reserved
        skip(4)
        # bytes 8 - 15: device size in bytes
        self.device_size = read_uint(8)
        # bytes 16 - 31: reserved
        skip(16)
    def get_name(self):
        """Return the device name as text.

        The blob is cut at its first null byte and decoded as UTF-8.
        """
        blob = self.__vma_header.blob_buffer[self.device_name]
        raw_name, _, _ = blob.data.partition(b'\0')
        return raw_name.decode('utf-8')
 class VmaExtentHeader():
    """Header of one extent: 40 bytes of metadata plus 59 block infos.

    A complete extent header is 512 bytes long (40 + 59 * 8). After
    construction ``fo`` is positioned directly behind it, where the
    extent's block data starts.

    Attributes set by the constructor: ``block_count``, ``uuid``,
    ``md5sum`` and ``blockinfo`` (a list of 59 ``Blockinfo`` objects). The
    stored ``md5sum`` is not verified here.

    Raises:
        ValueError: if the data at the current position does not start with
            the extent magic string.
    """
    def __init__(self, fo, vma_header):
        # 0 -  3:   magic
        #     VMA extent magic string ("VMAE")
        # Raise explicitly instead of using assert: the check validates
        # file content and must survive running python with -O.
        magic = fo.read(4)
        if magic != b'VMAE':
            raise ValueError(f'not a VMA extent (bad magic {magic!r})')
        # 4 -  5:   reserved
        fo.seek(2, os.SEEK_CUR)
        # 6 -  7:   block_count
        #     Overall number of contained 4K block
        self.block_count = int.from_bytes(fo.read(2), 'big')
        # 8 - 23:   uuid
        #     Unique ID, Same uuid as used in the VMA header.
        self.uuid = fo.read(16)
        # 24 - 39:   md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)
        # 40 - 511:   blockinfo[59]
        self.blockinfo = [Blockinfo(fo, vma_header) for _ in range(59)]
 class Blob():
    """A length-prefixed byte string read from the header's blob buffer.

    ``size`` is the payload length taken from the prefix and ``data`` is
    whatever could be read for that length.
    """
    def __init__(self, fo):
        # The length prefix is two bytes wide, least significant byte first
        # (LITTLE endian). This mirrors the original C code of vma-reader:
        #    uint32_t size = vmar->head_data[bstart] +
        #        (vmar->head_data[bstart+1] << 8);
        length_prefix = fo.read(2)
        self.size = int.from_bytes(length_prefix, byteorder='little')
        # the payload follows the prefix immediately
        payload = fo.read(self.size)
        self.data = payload
 class Blockinfo():
    """Descriptor of one cluster inside an extent (8 bytes when complete).

    ``mask`` is a 16 bit mask, ``dev_id`` selects the device and
    ``cluster_num`` is the cluster's index on that device.
    """
    # number of bytes addressed by one cluster number
    CLUSTER_SIZE = 64 * 1024
    def __init__(self, fo, vma_header):
        def next_uint(width):
            # every field of this record is stored big endian
            return int.from_bytes(fo.read(width), byteorder='big')

        self.__vma_header = vma_header
        # bytes 0 - 1: mask
        self.mask = next_uint(2)
        # byte 2: reserved
        fo.seek(1, os.SEEK_CUR)
        # byte 3: dev_id
        #    Device ID (offset into dev_info table)
        self.dev_id = next_uint(1)
        # bytes 4 - 7: cluster_num
        self.cluster_num = next_uint(4)
 def extract_configs(fo, args, vma_header):
    """
    Write every config stored in the VMA header into args.destination.

    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.
    The filename seems to be a null-terminated string, while the content is not
    terminated.

    Args:
        fo: unused; kept so the call signature stays as it is.
        args: parsed options; ``destination`` (existing output directory)
            and ``verbose`` are read.
        vma_header: header object providing ``config_names``,
            ``config_data`` and ``blob_buffer``.

    Raises:
        ValueError: if a stored file name has no usable final component.
        KeyError: if a config references an offset that is not in the blob
            buffer.
    """
    if args.verbose: print('extracting configs...')
    for name_offset, data_offset in zip(vma_header.config_names,
                                        vma_header.config_data):
        # a name offset of 0 marks an unused slot
        if name_offset == 0: continue
        raw_name = vma_header.blob_buffer[name_offset].data
        # interpret filename as a null-terminated utf-8 string
        stored_name = raw_name.split(b'\0')[0].decode('utf-8')
        # The name comes from the archive and must not be trusted: keep only
        # its final path component so that absolute paths or '../' parts
        # cannot place the file outside of the destination directory.
        config_name = os.path.basename(stored_name)
        if config_name in ('', '.', '..'):
            raise ValueError(
                f'config has an unusable file name: {stored_name!r}')
        if args.verbose: print(f'{config_name}...', end='')
        config_data = vma_header.blob_buffer[data_offset].data
        with open(os.path.join(args.destination, config_name), 'wb') as config_fo:
            config_fo.write(config_data)
        if args.verbose: print(' OK')
 def extract(fo, args):
    """
    Extract all configs and device images of a VMA archive.

    Args:
        fo: seekable binary file object of the archive.
        args: parsed options; ``destination`` (output directory, created if
            missing) and ``verbose`` are read.

    Raises:
        ValueError: if an extent carries a different uuid than the header,
            references a device for which no image is written, or a device
            name has no usable final path component.
    """
    os.makedirs(args.destination, exist_ok=True)
    # the total size tells us when the last extent has been consumed
    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)
    vma_header = VmaHeader(fo)
    extract_configs(fo, args, vma_header)
    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)
    if args.verbose: print('extracting devices...')
    zero_block = b'\0' * 4096
    # open file handlers for all devices within the VMA
    # so we can easily append data to arbitrary devices
    device_fos = {}
    # Last cluster written per device, used to detect gaps. This has to be
    # tracked per device: clusters of different devices are interleaved, and
    # with a single shared counter a cluster could look sequential merely
    # because another device had just written its predecessor, in which
    # case it would be written at the wrong offset.
    cluster_num_prev = {}
    try:
        for dev_id, dev_info in enumerate(vma_header.dev_info):
            if dev_info.device_size > 0:
                stored_name = dev_info.get_name()
                if args.verbose: print(stored_name)
                # the name comes from the archive: keep only its final path
                # component so the image cannot end up outside destination
                device_name = os.path.basename(stored_name)
                if device_name in ('', '.', '..'):
                    raise ValueError(
                        f'device has an unusable name: {stored_name!r}')
                device_fos[dev_id] = open(
                    os.path.join(args.destination, device_name), 'wb')
        if args.verbose: print('this may take a while...')
        while fo.tell() < filesize:
            # when there is data to read at this point, we can safely expect
            # a full extent header with additional clusters
            extent_header = VmaExtentHeader(fo, vma_header)
            # explicit raise instead of assert: must also hold under -O
            if vma_header.uuid != extent_header.uuid:
                raise ValueError(
                    'extent uuid does not match the uuid of the VMA header')
            for blockinfo in extent_header.blockinfo:
                # dev_id 0 marks an unused blockinfo slot
                if blockinfo.dev_id == 0: continue
                device_fo = device_fos.get(blockinfo.dev_id)
                if device_fo is None:
                    raise ValueError(
                        f'extent references unknown device {blockinfo.dev_id}')
                previous = cluster_num_prev.get(blockinfo.dev_id, -1)
                # non-sequential clusters encountered, handle this case
                if blockinfo.cluster_num != previous + 1:
                    if args.verbose: print('non sequential cluster encountered...')
                    cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE
                    if blockinfo.cluster_num > previous:
                        # special case: cluster num is larger than current,
                        # seek forward into file AND, if needed, fill missing
                        # size with zeros
                        device_fo.seek(0, os.SEEK_END)
                        written_size = device_fo.tell()
                        if written_size < cluster_pos:
                            # add padding for missing clusters
                            if args.verbose:
                                print(f'{blockinfo.cluster_num}')
                                print(f'adding {cluster_pos - written_size} '
                                      'bytes of padding...')
                            # write padding in chunks of 4096 bytes to avoid
                            # memory errors
                            padding = cluster_pos - written_size
                            while padding > 0:
                                device_fo.write(zero_block[:min(padding, 4096)])
                                padding -= 4096
                    # seek to start of new cluster
                    device_fo.seek(cluster_pos, os.SEEK_SET)
                cluster_num_prev[blockinfo.dev_id] = blockinfo.cluster_num
                for i in range(16):
                    # a 2-bytes wide bitmask indicates 4k blocks with only
                    # zeros: only blocks whose bit is set are stored in the
                    # archive, all others are written as zeros
                    if (1 << i) & blockinfo.mask:
                        device_fo.write(fo.read(4096))
                    else:
                        device_fo.write(zero_block)
    finally:
        # close the images even when extraction fails half way through
        if args.verbose: print('closing file handles...')
        for device_fo in device_fos.values():
            device_fo.close()
    if args.verbose: print('done')
 def main():
    """Command line entry point: parse the arguments and run the extraction.

    Returns:
        0 after the extraction ran, 1 when the source file does not exist
        or the destination exists and --force was not given.
    """
    cli = argparse.ArgumentParser()
    for positional in ('filename', 'destination'):
        cli.add_argument(positional, type=str)
    for short_flag, long_flag in (('-v', '--verbose'), ('-f', '--force')):
        cli.add_argument(short_flag, long_flag, default=False,
                         action='store_true')
    args = cli.parse_args()

    source_missing = not os.path.exists(args.filename)
    if source_missing:
        print('Error! Source file does not exist!')
        return 1

    # an existing destination is only accepted when --force is given
    destination_blocked = os.path.exists(args.destination) and not args.force
    if destination_blocked:
        print('Error! Destination path exists!')
        return 1

    with open(args.filename, 'rb') as source:
        extract(source, args)
    return 0
 if __name__ == '__main__':
    sys.exit(main())