initial commit

Jan Wolff 2019-06-21 09:35:42 +02:00
commit 940fe79caf
3 changed files with 336 additions and 0 deletions

18  LICENSE.md  Normal file

@@ -0,0 +1,18 @@
Copyright (c) 2019 Jan Wolff

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

15  README.md  Normal file

@@ -0,0 +1,15 @@
VMA extractor
=============

`vma.py` implements a VMA extraction tool in Python 3.

Usage:

```sh
./vma.py path/to/source.vma path/to/target/directory
```
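
The target directory will end up containing the guest configuration files
stored in the archive and one raw image file per disk, using the names
recorded in the backup.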

I think it is important to be able to read Proxmox backups outside of a
Proxmox environment. Yet porting their VMA implementation into a standalone
tool proved difficult: VMA-Reader and VMA-Writer are implemented as patches
against the Proxmox-patched version of QEMU and are thus very difficult to
compile on non-Proxmox systems.

303  vma.py  Executable file

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
import os
import sys
import hashlib
import struct
import argparse


class VmaHeader():
    def __init__(self, fo):
        # 0 - 3: magic
        # VMA magic string ("VMA\x00")
        magic = fo.read(4)
        assert magic == b'VMA\0'

        # 4 - 7: version
        # Version number (valid value is 1)
        version = int.from_bytes(fo.read(4), 'big')
        assert version == 1

        # 8 - 23: uuid
        # Unique ID, Same uuid is used to mark extents.
        self.uuid = fo.read(16)

        # 24 - 31: ctime
        # Backup time stamp (seconds since epoch)
        self.ctime = int.from_bytes(fo.read(8), 'big')

        # 32 - 47: md5sum
        # Header checksum (from byte 0 to header_size). This field
        # is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)
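        # note: the stored checksum is only read here; this tool does not
        # recompute or verify it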
        # 48 - 51: blob_buffer_offset
        # Start of blob buffer (multiple of 512)
        self.blob_buffer_offset = int.from_bytes(fo.read(4), 'big')

        # 52 - 55: blob_buffer_size
        # Size of blob buffer (multiple of 512)
        self.blob_buffer_size = int.from_bytes(fo.read(4), 'big')

        # 56 - 59: header_size
        # Overall size of this header (multiple of 512)
        self.header_size = int.from_bytes(fo.read(4), 'big')

        # 60 - 2043: reserved
        fo.seek(1984, os.SEEK_CUR)

        # 2044 - 3067: uint32_t config_names[256]
        # Offsets into blob_buffer table
        self.config_names = []
        for i in range(256):
            self.config_names.append(int.from_bytes(fo.read(4), 'big'))

        # 3068 - 4091: uint32_t config_data[256]
        # Offsets into blob_buffer table
        self.config_data = []
        for i in range(256):
            self.config_data.append(int.from_bytes(fo.read(4), 'big'))
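        # an offset of 0 marks an unused table slot (see extract_configs)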
        # 4092 - 4095: reserved
        fo.seek(4, os.SEEK_CUR)

        # 4096 - 12287: VmaDeviceInfoHeader dev_info[256]
        # The offset in this table is used as 'dev_id' inside
        # the data streams.
        self.dev_info = []
        for i in range(256):
            self.dev_info.append(VmaDeviceInfoHeader(fo, self))

        # 12288 - header_size: Blob buffer
        # the blob buffer layout is very odd. there appears to be an additional
        # byte of padding at the beginning
        fo.seek(1, os.SEEK_CUR)
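        # (presumably so that a blob offset of 0 can mean "no blob", as
        # checked in extract_configs)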
        # since byte-wise offsets are used to address the blob buffer, the
        # blob metadata is stored in a hashmap, with the offsets as the keys
        self.blob_buffer = {}
        blob_buffer_current_offset = 1
        while fo.tell() < self.blob_buffer_offset + self.blob_buffer_size:
            self.blob_buffer[blob_buffer_current_offset] = Blob(fo)
            blob_buffer_current_offset = fo.tell() - self.blob_buffer_offset

        # make sure the file object points at the end of the vma header
        fo.seek(self.header_size, os.SEEK_SET)


class VmaDeviceInfoHeader():
    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header

        # 0 - 3: device name (offsets into blob_buffer table)
        self.device_name = int.from_bytes(fo.read(4), 'big')

        # 4 - 7: reserved
        fo.seek(4, os.SEEK_CUR)

        # 8 - 15: device size in bytes
        self.device_size = int.from_bytes(fo.read(8), 'big')

        # 16 - 31: reserved
        fo.seek(16, os.SEEK_CUR)

    def get_name(self):
        name = self.__vma_header.blob_buffer[self.device_name].data
        return name.split(b'\0')[0].decode('utf-8')


class VmaExtentHeader():
    def __init__(self, fo, vma_header):
        # 0 - 3: magic
        # VMA extent magic string ("VMAE")
        magic = fo.read(4)
        assert magic == b'VMAE'

        # 4 - 5: reserved
        fo.seek(2, os.SEEK_CUR)

        # 6 - 7: block_count
        # Overall number of contained 4K blocks
        self.block_count = int.from_bytes(fo.read(2), 'big')

        # 8 - 23: uuid
        # Unique ID, Same uuid as used in the VMA header.
        self.uuid = fo.read(16)

        # 24 - 39: md5sum
        # Header checksum (from byte 0 to header_size). This field
        # is filled with zero to generate the checksum.
        self.md5sum = fo.read(16)

        # 40 - 511: blockinfo[59]
        self.blockinfo = []
        for i in range(59):
            self.blockinfo.append(Blockinfo(fo, vma_header))


class Blob():
    def __init__(self, fo):
        # the size of a blob is a two-byte int in LITTLE endian
        # source: original c code of vma-reader
        #   uint32_t size = vmar->head_data[bstart] +
        #                   (vmar->head_data[bstart+1] << 8);
        self.size = int.from_bytes(fo.read(2), 'little')
        self.data = fo.read(self.size)


class Blockinfo():
    CLUSTER_SIZE = 65536

    def __init__(self, fo, vma_header):
        self.__vma_header = vma_header

        # 0 - 1: mask
        self.mask = int.from_bytes(fo.read(2), 'big')
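        # each of the 16 mask bits corresponds to one 4 KiB block of the
        # 64 KiB cluster: a set bit means the block's data follows in the
        # extent stream, a cleared bit means the block is all zeros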
        # 2: reserved
        fo.seek(1, os.SEEK_CUR)

        # 3: dev_id
        # Device ID (offset into dev_info table)
        self.dev_id = int.from_bytes(fo.read(1), 'big')

        # 4 - 7: cluster_num
        self.cluster_num = int.from_bytes(fo.read(4), 'big')
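        # cluster_num is the index of this cluster within the target device;
        # its byte offset in the extracted image is cluster_num * CLUSTER_SIZE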


def extract_configs(fo, args, vma_header):
    """
    Configs in VMA are composed of two blobs. One specifies the config's
    filename and the other contains the config's content.

    The filename seems to be a null-terminated string, while the content is not
    terminated.
    """
    if args.verbose: print('extracting configs...')

    for i in range(256):
        if vma_header.config_names[i] == 0: continue

        config_name = vma_header.blob_buffer[vma_header.config_names[i]].data
        # interpret filename as a null-terminated utf-8 string
        config_name = config_name.split(b'\0')[0].decode('utf-8')
        if args.verbose: print(f'{config_name}...', end='')

        config_data = vma_header.blob_buffer[vma_header.config_data[i]].data
        with open(os.path.join(args.destination, config_name), 'wb') as config_fo:
            config_fo.write(config_data)

        if args.verbose: print(' OK')


def extract(fo, args):
    os.makedirs(args.destination, exist_ok=True)

    fo.seek(0, os.SEEK_END)
    filesize = fo.tell()
    fo.seek(0, os.SEEK_SET)

    vma_header = VmaHeader(fo)
    extract_configs(fo, args, vma_header)

    # extract_configs may move the read head somewhere into the blob buffer
    # make sure we are back at the end of the header
    fo.seek(vma_header.header_size, os.SEEK_SET)

    if args.verbose: print('extracting devices...')

    # open file handles for all devices within the VMA
    # so we can easily append data to arbitrary devices
    device_fos = {}
    for dev_id, dev_info in enumerate(vma_header.dev_info):
        if dev_info.device_size > 0:
            if args.verbose: print(dev_info.get_name())
            device_fos[dev_id] = open(os.path.join(args.destination, dev_info.get_name()), 'wb')

    if args.verbose: print('this may take a while...')

    # used for sanity checking
    cluster_num_prev = -1

    while fo.tell() < filesize:
        # when there is data to read at this point, we can safely expect a full
        # extent header with additional clusters
        extent_header = VmaExtentHeader(fo, vma_header)
        assert vma_header.uuid == extent_header.uuid

        for blockinfo in extent_header.blockinfo:
            if blockinfo.dev_id == 0: continue

            device_fo = device_fos[blockinfo.dev_id]

            # non-sequential clusters encountered, handle this case
            if blockinfo.cluster_num != cluster_num_prev + 1:
                if args.verbose: print('non-sequential cluster encountered...')
                cluster_pos = blockinfo.cluster_num * Blockinfo.CLUSTER_SIZE

                if blockinfo.cluster_num > cluster_num_prev:
                    # special case: cluster num is larger than current,
                    # seek forward into file AND, if needed, fill missing size
                    # with zeros
                    device_fo.seek(0, os.SEEK_END)
                    written_size = device_fo.tell()

                    if written_size < cluster_pos:
                        # add padding for missing clusters
                        if args.verbose:
                            print(f'{blockinfo.cluster_num}')
                            print(f'adding {cluster_pos - written_size} '
                                  'bytes of padding...')

                        # write padding in chunks of 4096 bytes to avoid
                        # memory errors
                        padding = cluster_pos - written_size
                        while padding > 0:
                            device_fo.write(b'\0' * min(padding, 4096))
                            padding -= 4096

                # seek to start of new cluster
                device_fo.seek(cluster_pos, os.SEEK_SET)

            cluster_num_prev = blockinfo.cluster_num

            for i in range(16):
                # a 2-byte-wide bitmask indicates 4k blocks with only zeros
                if (1 << i) & blockinfo.mask:
                    device_fo.write(fo.read(4096))
                else:
                    device_fo.write(b'\0' * 4096)

    if args.verbose: print('closing file handles...')
    for device_fo in device_fos.values():
        device_fo.close()

    if args.verbose: print('done')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', type=str)
    parser.add_argument('destination', type=str)
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('-f', '--force', default=False, action='store_true')
    args = parser.parse_args()

    if not os.path.exists(args.filename):
        print('Error! Source file does not exist!')
        return 1

    if os.path.exists(args.destination) and not args.force:
        print('Error! Destination path exists!')
        return 1

    with open(args.filename, 'rb') as fo:
        extract(fo, args)

    return 0


if __name__ == '__main__':
    sys.exit(main())