The backup data preparation is a two-step process:
colapse.awk and
disk-split.awk for correct directories and file sizes.
awk -f colapse.awk
awk -f disk-split.awk
I recommend running this with the batch, at,
or nohup command.
find -type f -or
-type l when collecting files if you want this functionality.
find capabilities. For other Unices the
<code>find</code> command can differ substantially.
# This AWK script will create a compressed backup tree with ZIP archiver
# limiting the max size of the .zip files to GRANULATION.
# If a directory is too big then its subdirectories are recursively compressed.
# Files in big directories are split into several archives if needed.
# Note that ZIP file size is limited to 2^31 bytes (2GB)
# usage: awk -f colapse.awk
# author: (C) 2003, Leon Kos
# License: GPL
# OS: BSD
# Main driver of colapse.awk.
#
# Walks every directory below SRC (parents are reported before their
# children by find) and mirrors the tree into DST as ZIP archives:
#   * a directory whose "du -sk" size exceeds GRANULATION is re-created as a
#     real directory in DST; its own files are packed by collect_files() and
#     its subdirectories are handled individually later in the walk;
#   * a smaller directory is zipped recursively into one archive, but only
#     when its parent was re-created as a directory (otherwise an ancestor's
#     archive already contains it).
# The top-level SRC directory is always re-created as a directory, whatever
# its size, so that DST exists before any archive is written into it.
BEGIN {
    SRC = "/home/home/staff";
    DST = "/home/staff-backup";
    # Archive size limit, given in megabytes and converted to bytes.
    GRANULATION = 2000;
    GRANULATION *= 1024 * 1024;
    ZIP_OPTS = "-9q";
    # du prints "<kilobytes><TAB><path>", so split records on tabs.
    FS = "\t";

    # Start from an empty destination tree.
    system("rm -rf " DST);

    findcmd = "find " SRC " -type d -print";
    while ((findcmd | getline) > 0)
    {
        # Make the path relative to SRC.  The prefix is removed literally
        # (not with sub(SRC, ...)) so that regex metacharacters in SRC such
        # as "." or "+" cannot affect the result.
        dir_name = $0;
        if (substr(dir_name, 1, length(SRC)) == SRC)
            dir_name = substr(dir_name, length(SRC) + 1);
        sub(/^\/+/, "", dir_name);

        # Disk usage of the whole subtree, in bytes.  When du prints nothing
        # the size stays 0 instead of being taken from the directory name
        # still sitting in $1.
        cmd = "du -sk \"" SRC "/" dir_name "\"";
        size = 0;
        if ((cmd | getline) > 0)
            size = $1 * 1024;
        close(cmd);

        if (size > GRANULATION || dir_name == "")
        {
            # Large directory (or the root): keep it as a directory in DST
            # and remember it so that its children get archives of their own.
            tree["/" dir_name] = size;
            cmd = "mkdir \"" DST "/" dir_name "\"";
            print DST "/" dir_name
            # print cmd;
            system(cmd);
            collect_files(SRC, DST, dir_name);
        }
        else
        {
            # Small directory: rebuild the parent's key ("/a/b" for "a/b/c",
            # "" for a top-level directory).
            depth = split(dir_name, A, "/");
            parent = "";
            for (i = 1; i < depth; i++)
                parent = parent "/" A[i];

            # Archive only directly below a re-created directory; anything
            # deeper is already inside an ancestor's archive.
            if (parent == "" || (parent in tree))
            {
                cmd = "cd \"" SRC "\" && zip " ZIP_OPTS " -r \"" \
                      DST "/" dir_name ".zip\" \"" dir_name "\"";
                # print cmd, "size:", size;
                system(cmd);
            }
            # else: dir_name is covered by an ancestor's .zip
        }
    }
    close(findcmd);
    exit(0);
}
# Collect the regular files that sit directly in a big directory (its
# subdirectories are not descended into) and pack them into numbered
# archives "#0.zip", "#1.zip", ... inside the matching directory of dst.
#
# Parameters:
#   src       root of the tree being backed up
#   dst       root of the archive tree
#   dir_name  directory to process, relative to src ("" for src itself)
# The names after dir_name in the parameter list are local variables;
# callers pass only the first three arguments.
#
# Reads the globals GRANULATION (byte threshold at which the current batch
# is written out) and ZIP_OPTS.  File names containing a newline are not
# supported because the list is exchanged line by line.
function collect_files(src, dst, dir_name,
                       cmd, line, tab, size, path, total, archive_number, file_list)
{
    # One line per file: "<size in bytes><TAB><path>" (BSD stat syntax).
    cmd = "find \"" src "/" dir_name "\" -maxdepth 1 -type f -print0 | xargs -0 stat -f '%z%t%N'";
    # print cmd;
    total = 0;
    archive_number = 0;
    file_list = "";
    # Read into a variable so the caller's $0 and fields stay untouched.
    while ((cmd | getline line) > 0)
    {
        # Split on the first tab only, so a tab inside the name survives
        # and the function does not depend on the current FS.
        tab = index(line, "\t");
        if (tab == 0)
            continue;
        size = substr(line, 1, tab - 1) + 0;
        path = substr(line, tab + 1);

        # Make the path relative to src by removing the prefix literally.
        # All leading slashes are dropped: when dir_name is empty the search
        # path ends in "/" and find may report "src//file".
        if (substr(path, 1, length(src)) == src)
            path = substr(path, length(src) + 1);
        sub(/^\/+/, "", path);

        file_list = file_list path "\n";
        total += size;
        if (total > GRANULATION)
        {
            zip_file_list(src, dst "/" dir_name "/#" archive_number, file_list);
            file_list = "";
            total = 0;
            archive_number++;
        }
    }
    close(cmd);

    # Write out the remainder.  The test is on the list, not on the byte
    # total, so that a trailing batch of zero-length files is kept.
    if (file_list != "")
        zip_file_list(src, dst "/" dir_name "/#" archive_number, file_list);
}

# Helper for collect_files(): run zip from inside src and feed it the
# newline-separated names in file_list on standard input (zip -@).  The names
# never appear on the shell command line, so quotes, "$" or backquotes in
# them cannot alter the command.
function zip_file_list(src, archive, file_list,    zipcmd)
{
    zipcmd = "cd \"" src "\" && zip " ZIP_OPTS " -@ \"" archive "\"";
    # print zipcmd;
    printf "%s", file_list | zipcmd;
    close(zipcmd);
}
# CDR & DVD backup
#
# We assume a backup tree created with the colapse.awk utility.
# The size of the compressed archives should be at most the size of the target
# removable media. This ensures that a split archive will span
# at most two disks. This recommendation is not obligatory for this utility.
# If you span an archive over more disks, more time/disk space is needed for
# splitting. Again, please note that there is a 2^31 byte file size limitation!
# This means that a zip archive cannot span more than two DVD media!
# Small archives are only symbolically linked. For creating real disk
# images one should use symbolic link dereferencing with the utility which
# will transfer the files to the media! For example use "du -L *" in the created
# disk directories to verify that the largest disk will fit the media!
# To prevent an archive from being split over two disks, with
# a small head on one disk and a large tail on another, the KEEP_TOGETHER
# parameter assigns the allowable space waste (tolerance) to be left on one disk
# just to keep things together (e.g. 3% of disk). Please note that space
# waste also occurs in the zipsplit utility. All sizes are given in BYTES.
# I recommend starting with a greedy 1% KEEP_TOGETHER and increasing it
# until the number of disks required for the whole backup is the same. If
# one wants to rely on zipsplit bin packing with no widow protection, it
# is also admissible to set KEEP_TOGETHER=0;
#
# author: (C) 2003, Leon Kos
# License: GPL
# usage: awk -f disk-split.awk
# OS: BSD
#
# Main driver of disk-split.awk.
#
# Distributes the *.zip files found below SRC over directories
# DST/disk00, DST/disk01, ... so that each directory holds at most
# DISK_SIZE bytes:
#   * an archive that fits on the current disk is symlinked into it;
#   * an archive that does not fit and is smaller than KEEP_TOGETHER is
#     symlinked into a fresh disk instead (the rest of the old disk is wasted);
#   * an archive that does not fit and is at least KEEP_TOGETHER bytes is cut
#     with zipsplit; the first piece goes onto the current disk and every
#     further piece opens a new disk.
# Commands are assembled with sprintf("%s", ...) arguments or plain
# concatenation, never by placing a file name inside a format string, so a
# "%" in a path cannot corrupt them.
# Exit status: 0 on success, 2 when zipsplit gives no usable output,
# 3 when the size of a split piece cannot be read.
BEGIN {
    # Temporary storage of the large zips created with colapse.awk.
    SRC = "/home/staff-backup";
    # Where the per-disk directories are created.
    DST = "/home/dvd-split";
    MB = 1024 * 1024;
    DISK_SIZE = 700 * MB;    # CDR
    DISK_SIZE = 4400 * MB;   # DVD (overrides the CDR value above)
    KEEP_TOGETHER = DISK_SIZE / 100 * 2;   # Prevent widow archives
    # stat prints "<size><TAB><path>", so split records on tabs.
    FS = "\t";

    system("rm -rf " DST "; mkdir " DST "; mkdir " DST "/disk00");
    disk = 0;
    total_size = 0;   # bytes already placed on the current disk

    findcmd = "find " SRC " -type f -name '*.zip' -print0 |" \
              " xargs -0 stat -f %z%t%N";
    while ((findcmd | getline) > 0)
    {
        size = $1;
        filename = $2;
        if ((total_size + size > DISK_SIZE) && (size >= KEEP_TOGETHER))
        {
            # Does not fit and is too big to move as a whole: split it.
            disk_dir = sprintf("%s/disk%02d", DST, disk);
            if (DISK_SIZE > 2^31)   # zip and awk size limitation
            {
                # Pieces are capped at 2^31 - 1 bytes; -r reserves enough of
                # the first piece that it is no larger than the free space
                # left on the current disk.
                max_piece = 2^31 - 1;
                reserve = max_piece - (DISK_SIZE - total_size);
            }
            else
            {
                # Pieces may fill a whole disk; reserve what is already used.
                max_piece = DISK_SIZE;
                reserve = total_size;
            }
            cmdsplit = "zipsplit -n " max_piece " -r " reserve \
                       " -b " disk_dir " \"" filename "\"";
            # print cmdsplit;

            # Line 1 of the zipsplit output is taken to be a summary line and
            # line 2 the first "creating: <path>" line.  Stop if either is
            # missing rather than moving a bogus path.
            got_output = ((cmdsplit | getline) > 0);
            if (got_output)
                got_output = ((cmdsplit | getline) > 0);
            if (!got_output)
            {
                print "disk-split: no output from zipsplit for " filename | "cat 1>&2";
                exit(2);
            }

            sub("creating: ", "");
            archive = 0;
            cmd = sprintf("mv \"%s\" \"%s/%s.%d.zip\"", \
                          $0, disk_dir, mangle(filename), archive);
            # print cmd;
            system(cmd);

            # Every further piece goes onto a disk of its own.
            while ((cmdsplit | getline) > 0)   # should run only once
            {
                disk++;
                archive++;
                disk_dir = sprintf("%s/disk%02d", DST, disk);
                cmd = "mkdir " disk_dir;
                print cmd; system(cmd);
                sub("creating: ", "");
                # The size of this piece is what the new disk already holds.
                cmd = "stat -f %z \"" $0 "\"";
                if ((cmd | getline total_size) == 1)
                    close(cmd);
                else
                    exit(3);
                cmd = sprintf("mv \"%s\" \"%s/%s.%d.zip\"", \
                              $0, disk_dir, mangle(filename), archive);
                # print cmd;
                system(cmd);
            }
            close(cmdsplit);
        }
        else   # just link to original
        {
            if ((total_size + size > DISK_SIZE) && (size < KEEP_TOGETHER))
            {
                # Small archive that does not fit: open a new disk for it.
                disk++;
                cmd = sprintf("mkdir %s/disk%02d", DST, disk);
                system(cmd);
                total_size = 0;
                print "Widow protection for ", filename;
            }
            total_size += size;
            cmd = sprintf("ln -s \"%s\" \"%s/disk%02d/%s.zip\"", \
                          filename, DST, disk, mangle(filename));
            system(cmd);
            # print size, cmd;
        }
    }
    close(findcmd);
    exit(0)
}
# Turn the path of an archive into a flat file name.
#
# A leading SRC (or, failing that, a leading DST) is removed, then one
# leading "/", every remaining "/" becomes "-", and a trailing ".zip" is
# dropped.  Example with SRC="/home/staff-backup":
#   /home/staff-backup/a/b.zip  ->  a-b
# The base directories are compared as literal prefixes, not as regular
# expressions, so characters such as "." or "+" in SRC or DST are safe.
#
# Parameter: filename - path to convert.  "mangled" is a local variable.
# Returns the flattened name without extension.
function mangle(filename,    mangled)
{
    mangled = filename;
    if (SRC != "" && substr(mangled, 1, length(SRC)) == SRC)
        mangled = substr(mangled, length(SRC) + 1);
    else if (DST != "" && substr(mangled, 1, length(DST)) == DST)
        mangled = substr(mangled, length(DST) + 1);
    sub(/^\//, "", mangled);
    gsub(/\//, "-", mangled);
    # sub("#", ".", mangled);
    sub(/\.zip$/, "", mangled);
    return mangled;
}