vma-extractor/vma.py

#!/usr/bin/env python3
import os
import sys
import hashlib
import struct
import argparse


class VmaHeader():
    def __init__(self, fo):
        # 0 -  3:   magic
        #     VMA magic string ("VMA\x00")
        magic = fo.read(4)
        assert magic == b'VMA\0'

        # 4 -  7:   version
        #     Version number (valid value is 1)
        version = int.from_bytes(fo.read(4), 'big')
        assert version == 1

        # 8 - 23:   uuid
        #     Unique ID, Same uuid is used to mark extents.
        self.uuid = fo.read(16)

        # 24 - 31:   ctime
        #     Backup time stamp (seconds since epoch)
        self.ctime = int.from_bytes(fo.read(8), 'big')

        # 32 - 47:   md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)

        # 48 - 51:   blob_buffer_offset
        #     Start of blob buffer (multiple of 512)
        self.blob_buffer_offset = int.from_bytes(fo.read(4), 'big')

        # 52 - 55:   blob_buffer_size
        #     Size of blob buffer (multiple of 512)
        self.blob_buffer_size = int.from_bytes(fo.read(4), 'big')

        # 56 - 59:   header_size
        #     Overall size of this header (multiple of 512)
        self.header_size = int.from_bytes(fo.read(4), 'big')

        # 60 - 2043: reserved
        fo.seek(1984, os.SEEK_CUR)

        # 2044 - 3067: uint32_t config_names[256]
        #     Offsets into blob_buffer table
        self.config_names = []
        for i in range(256):
            self.config_names.append(int.from_bytes(fo.read(4), 'big'))

        # 3068 - 4091: uint32_t config_data[256]
        #     Offsets into blob_buffer table
        self.config_data = []
        for i in range(256):
            self.config_data.append(int.from_bytes(fo.read(4), 'big'))

        # 4092 - 4095: reserved
        fo.seek(4, os.SEEK_CUR)

        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
        #     The offset in this table is used as 'dev_id' inside
        #     the data streams.
        self.dev_info = []
        for i in range(256):
            self.dev_info.append(VmaDeviceInfoHeader(fo, self))

        # 12288 - header_size: Blob buffer

        # the blob buffer layout is very odd. there appears to be an additional
        # byte of padding at the beginning
        fo.seek(1, os.SEEK_CUR)
        # since byte-wise offsets are used to address the blob buffer, the
        # blob metadata is stored in a hashmap, with the offsets as the keys
        self.blob_buffer = {}
        blob_buffer_current_offset = 1
        while(fo.tell() < self.blob_buffer_offset + self.blob_buffer_size):
            self.blob_buffer[blob_buffer_current_offset] = Blob(fo)
            blob_buffer_current_offset = fo.tell() - self.blob_buffer_offset

        # make sure the file object points at the end of the vma header
        fo.seek(self.header_size, os.SEEK_SET)

        # reread the header and generate a md5 checksum of the data
        self.__gen_md5sum(fo)


    def __gen_md5sum(self, fo):
        p = fo.tell()
        fo.seek(0, os.SEEK_SET)
        h = hashlib.md5()

        data = fo.read(self.header_size)
        data = data[:32] + b'\0' * 16 + data[48:]
        h.update(data)

        self.generated_md5sum = h.digest()

        fo.seek(p, os.SEEK_SET)


class VmaDeviceInfoHeader():
    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header

        # 0 -  3:   devive name (offsets into blob_buffer table)
        self.device_name = int.from_bytes(fo.read(4), 'big')

        # 4 -  7:   reserved
        fo.seek(4, os.SEEK_CUR)

        # 8 - 15:   device size in bytes
        self.device_size = int.from_bytes(fo.read(8), 'big')

        # 16 - 31:   reserved
        fo.seek(16, os.SEEK_CUR)


    def get_name(self):
        name = self.__vma_header.blob_buffer[self.device_name].data
        return name.split(b'\0')[0].decode('utf-8')


class VmaExtentHeader():
    def __init__(self, fo, vma_header):
        self.pos_start = fo.tell()

        # 0 -  3:   magic
        #     VMA extent magic string ("VMAE")
        magic = fo.read(4)
        assert magic == b'VMAE'

        # 4 -  5:   reserved
        fo.seek(2, os.SEEK_CUR)

        # 6 -  7:   block_count
        #     Overall number of contained 4K block
        self.block_count = int.from_bytes(fo.read(2), 'big')

        # 8 - 23:   uuid
        #     Unique ID, Same uuid as used in the VMA header.
        self.uuid = fo.read(16)

        # 24 - 39:   md5sum
        #     Header checksum (from byte 0 to header_size). This field
        #     is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)

        # 40 - 511:   blockinfo[59]
        self.blockinfo = []
        for i in range(59):
            self.blockinfo.append(Blockinfo(fo, vma_header))

        self.pos_end = fo.tell()

        self.__gen_md5sum(fo)


    def __gen_md5sum(self, fo):
        p = fo.tell()
        fo.seek(self.pos_start, os.SEEK_SET)
        h = hashlib.md5()

        data = fo.read(self.pos_end - self.pos_start)
        data = data[:24] + b'\0' * 16 + data[40:]
        h.update(data)

        self.generated_md5sum = h.digest()

        fo.seek(p, os.SEEK_SET)


class Blob():
    def __init__(self, fo):
        # the size of a blob is a two-byte int in LITTLE endian
        # source: original c code of vma-reader
        #    uint32_t size = vmar->head_data[bstart] +
        #        (vmar->head_data[bstart+1] << 8);
        self.size = int.from_bytes(fo.read(2), 'little')
        self.data = fo.read(self.size)


class Blockinfo():
    CLUSTER_SIZE = 65536

    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header

        # 0 - 1:   mask
        self.mask = int.from_bytes(fo.read(2), 'big')

        # 2:   reserved
        fo.seek(1, os.SEEK_CUR)

        # 3:   dev_id
        #    Device ID (offset into dev_info table)
        self.dev_id = int.from_bytes(fo.read(1), 'big')

        # 4 - 7:   cluster_num
        self.cluster_num = int.from_bytes(fo.read(4), 'big')


def extract_configs(fo, args, vma_header):
    """
    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.
    The filename seems to be a null-terminated string, while the content is not
    terminated.
    """

    if args.verbose: print('extracting configs...')

    for i in range(256):
        if vma_header.config_names[i] == 0: continue
        config_name = vma_header.blob_buffer[vma_header.config_names[i]].data
        # interpret filename as a null-terminated utf-8 string
        config_name = config_name.split(b'\0')[0].decode('utf-8')

        if args.verbose: print(f'{config_name}...', end='')

        config_data = vma_header.blob_buffer[vma_header.config_data[i]].data

        with open(os.path.join(args.destination, config_name), 'wb') as config_fo:
            config_fo.write(config_data)

        if args.verbose: print(' OK')


def extract(fo, args):
    os.makedirs(args.destination, exist_ok=True)

    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)

    vma_header = VmaHeader(fo)

    # check the md5 checksum given in the header with the value calculated from
    # the file
    if not args.skip_hash:
        assert vma_header.md5sum == vma_header.generated_md5sum

    extract_configs(fo, args, vma_header)

    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)

    if args.verbose: print('extracting devices...')

    # open file handlers for all devices within the VMA
    # so we can easily append data to arbitrary devices
    device_fos = {}
    for dev_id, dev_info in enumerate(vma_header.dev_info):
        if dev_info.device_size > 0:
            if args.verbose: print(dev_info.get_name())
            device_fos[dev_id] = open(os.path.join(args.destination, dev_info.get_name()), 'wb')

    if args.verbose: print('this may take a while...')

    # used for sanity checking
    cluster_num_prev = -1

    while(fo.tell() < filesize):
        # when there is data to read at this point, we can safely expect a full
        # extent header with additional clusters
        extent_header = VmaExtentHeader(fo, vma_header)
        assert vma_header.uuid == extent_header.uuid

        # check the md5 checksum given in the header with the value calculated from
        # the file
        if not args.skip_hash:
            assert extent_header.md5sum == extent_header.generated_md5sum

        for blockinfo in extent_header.blockinfo:
            if blockinfo.dev_id == 0: continue

            device_fo = device_fos[blockinfo.dev_id]

            # non-sequential clusters encountered, handle this case
            if blockinfo.cluster_num != cluster_num_prev + 1:
                if args.verbose: print('non sequential cluster encountered...')

                cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE
                if blockinfo.cluster_num > cluster_num_prev:
                    # special case: cluster num is larger than current,
                    # seek forward into file AND, if needed, fill missing size
                    # with zeros
                    device_fo.seek(0, os.SEEK_END)
                    written_size = device_fo.tell()

                    if written_size < cluster_pos:
                        # add padding for missing clusters
                        if args.verbose:
                            print(f'{blockinfo.cluster_num}')
                            print(f'adding {cluster_pos - written_size} bytes'
                                 + 'of padding...')

                        # write padding in chucks of 4096 bytes to avoid
                        # memory errors
                        padding = cluster_pos - written_size
                        while padding > 0:
                            device_fo.write(b'\0' * min(padding, 4096))
                            padding -= 4096

                # seek to start of new cluster
                device_fo.seek(cluster_pos, os.SEEK_SET)

            cluster_num_prev = blockinfo.cluster_num

            for i in range(16):
                # a 2-bytes wide bitmask indicates 4k blocks with only zeros
                if (1 << i) & blockinfo.mask:
                    device_fo.write(fo.read(4096))
                else:
                    device_fo.write(b'\0' * 4096)

    if args.verbose: print('closing file handles...')
    for device_fo in device_fos.values():
        device_fo.close()

    if args.verbose: print('done')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', type=str)
    parser.add_argument('destination', type=str)
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('-f', '--force', default=False, action='store_true',
            help='overwrite target file if it exists')
    parser.add_argument('--skip-hash', default=False, action='store_true',
            help='do not perform md5 checksum test of data')
    args = parser.parse_args()

    if(not os.path.exists(args.filename)):
        print('Error! Source file does not exist!')
        return 1

    if(os.path.exists(args.destination) and not args.force):
        print('Error! Destination path exists!')
        return 1

    with open(args.filename, 'rb') as fo:
        extract(fo, args)

    return 0

if __name__ == '__main__':
    sys.exit(main())
initial commit 2019-06-21 07:35:42 +00:00			`#!/usr/bin/env python3`
			`import os`
			`import sys`
			`import hashlib`
			`import struct`
			`import argparse`


			`class VmaHeader():`
			`def __init__(self, fo):`
			`# 0 - 3: magic`
			`# VMA magic string ("VMA\x00")`
			`magic = fo.read(4)`
			`assert magic == b'VMA\0'`

			`# 4 - 7: version`
			`# Version number (valid value is 1)`
			`version = int.from_bytes(fo.read(4), 'big')`
			`assert version == 1`

			`# 8 - 23: uuid`
			`# Unique ID, Same uuid is used to mark extents.`
			`self.uuid = fo.read(16)`

			`# 24 - 31: ctime`
			`# Backup time stamp (seconds since epoch)`
			`self.ctime = int.from_bytes(fo.read(8), 'big')`

			`# 32 - 47: md5sum`
			`# Header checksum (from byte 0 to header_size). This field`
			`# is filled with zero to generate the checksum.`
			`self.md5sum = fo.read(16)`

			`# 48 - 51: blob_buffer_offset`
			`# Start of blob buffer (multiple of 512)`
			`self.blob_buffer_offset = int.from_bytes(fo.read(4), 'big')`

			`# 52 - 55: blob_buffer_size`
			`# Size of blob buffer (multiple of 512)`
			`self.blob_buffer_size = int.from_bytes(fo.read(4), 'big')`

			`# 56 - 59: header_size`
			`# Overall size of this header (multiple of 512)`
			`self.header_size = int.from_bytes(fo.read(4), 'big')`

			`# 60 - 2043: reserved`
			`fo.seek(1984, os.SEEK_CUR)`

			`# 2044 - 3067: uint32_t config_names[256]`
			`# Offsets into blob_buffer table`
			`self.config_names = []`
			`for i in range(256):`
			`self.config_names.append(int.from_bytes(fo.read(4), 'big'))`

			`# 3068 - 4091: uint32_t config_data[256]`
			`# Offsets into blob_buffer table`
			`self.config_data = []`
			`for i in range(256):`
			`self.config_data.append(int.from_bytes(fo.read(4), 'big'))`

			`# 4092 - 4095: reserved`
			`fo.seek(4, os.SEEK_CUR)`

			`# 4096 - 12287: VmaDeviceInfoHeader dev_info[256]`
			`# The offset in this table is used as 'dev_id' inside`
			`# the data streams.`
			`self.dev_info = []`
			`for i in range(256):`
			`self.dev_info.append(VmaDeviceInfoHeader(fo, self))`

			`# 12288 - header_size: Blob buffer`

			`# the blob buffer layout is very odd. there appears to be an additional`
			`# byte of padding at the beginning`
			`fo.seek(1, os.SEEK_CUR)`
			`# since byte-wise offsets are used to address the blob buffer, the`
			`# blob metadata is stored in a hashmap, with the offsets as the keys`
			`self.blob_buffer = {}`
			`blob_buffer_current_offset = 1`
			`while(fo.tell() < self.blob_buffer_offset + self.blob_buffer_size):`
			`self.blob_buffer[blob_buffer_current_offset] = Blob(fo)`
			`blob_buffer_current_offset = fo.tell() - self.blob_buffer_offset`

			`# make sure the file object points at the end of the vma header`
			`fo.seek(self.header_size, os.SEEK_SET)`

also check md5 hashes given in headers 2019-09-09 14:44:03 +00:00			`# reread the header and generate a md5 checksum of the data`
			`self.__gen_md5sum(fo)`


			`def __gen_md5sum(self, fo):`
			`p = fo.tell()`
			`fo.seek(0, os.SEEK_SET)`
			`h = hashlib.md5()`

			`data = fo.read(self.header_size)`
			`data = data[:32] + b'\0' * 16 + data[48:]`
			`h.update(data)`

			`self.generated_md5sum = h.digest()`

			`fo.seek(p, os.SEEK_SET)`

initial commit 2019-06-21 07:35:42 +00:00
			`class VmaDeviceInfoHeader():`
			`def __init__(self, fo, vma_header):`
			`self.__vma_header = vma_header`

			`# 0 - 3: devive name (offsets into blob_buffer table)`
			`self.device_name = int.from_bytes(fo.read(4), 'big')`

			`# 4 - 7: reserved`
			`fo.seek(4, os.SEEK_CUR)`

			`# 8 - 15: device size in bytes`
			`self.device_size = int.from_bytes(fo.read(8), 'big')`

			`# 16 - 31: reserved`
			`fo.seek(16, os.SEEK_CUR)`


			`def get_name(self):`
			`name = self.__vma_header.blob_buffer[self.device_name].data`
			`return name.split(b'\0')[0].decode('utf-8')`


			`class VmaExtentHeader():`
			`def __init__(self, fo, vma_header):`
also check md5 hashes given in headers 2019-09-09 14:44:03 +00:00			`self.pos_start = fo.tell()`

initial commit 2019-06-21 07:35:42 +00:00			`# 0 - 3: magic`
			`# VMA extent magic string ("VMAE")`
			`magic = fo.read(4)`
			`assert magic == b'VMAE'`

			`# 4 - 5: reserved`
			`fo.seek(2, os.SEEK_CUR)`

			`# 6 - 7: block_count`
			`# Overall number of contained 4K block`
			`self.block_count = int.from_bytes(fo.read(2), 'big')`

			`# 8 - 23: uuid`
			`# Unique ID, Same uuid as used in the VMA header.`
			`self.uuid = fo.read(16)`

			`# 24 - 39: md5sum`
			`# Header checksum (from byte 0 to header_size). This field`
			`# is filled with zero to generate the checksum.`
			`self.md5sum = fo.read(16)`

			`# 40 - 511: blockinfo[59]`
			`self.blockinfo = []`
			`for i in range(59):`
			`self.blockinfo.append(Blockinfo(fo, vma_header))`

also check md5 hashes given in headers 2019-09-09 14:44:03 +00:00			`self.pos_end = fo.tell()`

			`self.__gen_md5sum(fo)`


			`def __gen_md5sum(self, fo):`
			`p = fo.tell()`
			`fo.seek(self.pos_start, os.SEEK_SET)`
			`h = hashlib.md5()`

			`data = fo.read(self.pos_end - self.pos_start)`
			`data = data[:24] + b'\0' * 16 + data[40:]`
			`h.update(data)`

			`self.generated_md5sum = h.digest()`

			`fo.seek(p, os.SEEK_SET)`

initial commit 2019-06-21 07:35:42 +00:00
			`class Blob():`
			`def __init__(self, fo):`
			`# the size of a blob is a two-byte int in LITTLE endian`
			`# source: original c code of vma-reader`
			`# uint32_t size = vmar->head_data[bstart] +`
			`# (vmar->head_data[bstart+1] << 8);`
			`self.size = int.from_bytes(fo.read(2), 'little')`
			`self.data = fo.read(self.size)`


			`class Blockinfo():`
			`CLUSTER_SIZE = 65536`

			`def __init__(self, fo, vma_header):`
			`self.__vma_header = vma_header`

			`# 0 - 1: mask`
			`self.mask = int.from_bytes(fo.read(2), 'big')`

			`# 2: reserved`
			`fo.seek(1, os.SEEK_CUR)`

			`# 3: dev_id`
			`# Device ID (offset into dev_info table)`
			`self.dev_id = int.from_bytes(fo.read(1), 'big')`

			`# 4 - 7: cluster_num`
			`self.cluster_num = int.from_bytes(fo.read(4), 'big')`


			`def extract_configs(fo, args, vma_header):`
			`"""`
			`Configs in VMA are composed of two blobs. One specifies the config's`
			`filename and the other contains the config's content.`
			`The filename seems to be a null-terminated string, while the content is not`
			`terminated.`
			`"""`

			`if args.verbose: print('extracting configs...')`

			`for i in range(256):`
			`if vma_header.config_names[i] == 0: continue`
			`config_name = vma_header.blob_buffer[vma_header.config_names[i]].data`
			`# interpret filename as a null-terminated utf-8 string`
			`config_name = config_name.split(b'\0')[0].decode('utf-8')`

			`if args.verbose: print(f'{config_name}...', end='')`

			`config_data = vma_header.blob_buffer[vma_header.config_data[i]].data`

			`with open(os.path.join(args.destination, config_name), 'wb') as config_fo:`
			`config_fo.write(config_data)`

			`if args.verbose: print(' OK')`


			`def extract(fo, args):`
			`os.makedirs(args.destination, exist_ok=True)`

			`fo.seek(0, os.SEEK_END)`
			`filesize = fo.tell()`
			`fo.seek(0, os.SEEK_SET)`

			`vma_header = VmaHeader(fo)`

also check md5 hashes given in headers 2019-09-09 14:44:03 +00:00			`# check the md5 checksum given in the header with the value calculated from`
			`# the file`
			`if not args.skip_hash:`
			`assert vma_header.md5sum == vma_header.generated_md5sum`

initial commit 2019-06-21 07:35:42 +00:00			`extract_configs(fo, args, vma_header)`

			`# extract_configs may move the read head somewhere into the blob buffer`
			`# make sure we are back at the end of the header`
			`fo.seek(vma_header.header_size, os.SEEK_SET)`

			`if args.verbose: print('extracting devices...')`

			`# open file handlers for all devices within the VMA`
			`# so we can easily append data to arbitrary devices`
			`device_fos = {}`
			`for dev_id, dev_info in enumerate(vma_header.dev_info):`
			`if dev_info.device_size > 0:`
			`if args.verbose: print(dev_info.get_name())`
			`device_fos[dev_id] = open(os.path.join(args.destination, dev_info.get_name()), 'wb')`

			`if args.verbose: print('this may take a while...')`

			`# used for sanity checking`
			`cluster_num_prev = -1`

			`while(fo.tell() < filesize):`
			`# when there is data to read at this point, we can safely expect a full`
			`# extent header with additional clusters`
			`extent_header = VmaExtentHeader(fo, vma_header)`
			`assert vma_header.uuid == extent_header.uuid`

also check md5 hashes given in headers 2019-09-09 14:44:03 +00:00			`# check the md5 checksum given in the header with the value calculated from`
			`# the file`
			`if not args.skip_hash:`
			`assert extent_header.md5sum == extent_header.generated_md5sum`

initial commit 2019-06-21 07:35:42 +00:00			`for blockinfo in extent_header.blockinfo:`
			`if blockinfo.dev_id == 0: continue`

			`device_fo = device_fos[blockinfo.dev_id]`

			`# non-sequential clusters encountered, handle this case`
			`if blockinfo.cluster_num != cluster_num_prev + 1:`
			`if args.verbose: print('non sequential cluster encountered...')`

			`cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE`
			`if blockinfo.cluster_num > cluster_num_prev:`
			`# special case: cluster num is larger than current,`
			`# seek forward into file AND, if needed, fill missing size`
			`# with zeros`
			`device_fo.seek(0, os.SEEK_END)`
			`written_size = device_fo.tell()`

			`if written_size < cluster_pos:`
			`# add padding for missing clusters`
			`if args.verbose:`
			`print(f'{blockinfo.cluster_num}')`
			`print(f'adding {cluster_pos - written_size} bytes'`
			`+ 'of padding...')`

			`# write padding in chucks of 4096 bytes to avoid`
			`# memory errors`
			`padding = cluster_pos - written_size`
			`while padding > 0:`
			`device_fo.write(b'\0' * min(padding, 4096))`
			`padding -= 4096`

			`# seek to start of new cluster`
			`device_fo.seek(cluster_pos, os.SEEK_SET)`

			`cluster_num_prev = blockinfo.cluster_num`

			`for i in range(16):`
			`# a 2-bytes wide bitmask indicates 4k blocks with only zeros`
			`if (1 << i) & blockinfo.mask:`
			`device_fo.write(fo.read(4096))`
			`else:`
			`device_fo.write(b'\0' * 4096)`

			`if args.verbose: print('closing file handles...')`
			`for device_fo in device_fos.values():`
			`device_fo.close()`

			`if args.verbose: print('done')`


			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('filename', type=str)`
			`parser.add_argument('destination', type=str)`
			`parser.add_argument('-v', '--verbose', default=False, action='store_true')`
also check md5 hashes given in headers 2019-09-09 14:44:03 +00:00			`parser.add_argument('-f', '--force', default=False, action='store_true',`
			`help='overwrite target file if it exists')`
			`parser.add_argument('--skip-hash', default=False, action='store_true',`
			`help='do not perform md5 checksum test of data')`
initial commit 2019-06-21 07:35:42 +00:00			`args = parser.parse_args()`

			`if(not os.path.exists(args.filename)):`
			`print('Error! Source file does not exist!')`
			`return 1`

			`if(os.path.exists(args.destination) and not args.force):`
			`print('Error! Destination path exists!')`
			`return 1`

			`with open(args.filename, 'rb') as fo:`
			`extract(fo, args)`

			`return 0`

			`if __name__ == '__main__':`
			`sys.exit(main())`