Creating a mirror
mirror-snapshot
mirror-snapshot creates hard-linked snapshots of the repository, as seen at http://alfplayer.com/. Both of the following files are needed (a short sketch of the hard-link idea follows the links):
http://evc.link/mirror-snapshot/mirror-snapshot
http://evc.link/mirror-snapshot/mirror-snapshot-config
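The idea behind hard-linked snapshots is that each dated snapshot hard-links the repository's files instead of copying them, so unchanged packages are stored on disk only once. Here is a minimal sketch of the technique; this is not the actual mirror-snapshot implementation, and snapshot_tree and the directory layout are made up for the example:

#!/usr/bin/env python
"""Sketch: snapshot a directory tree using hard links."""
import os

def snapshot_tree(source, snapshot_dir):
    # Recreate the directory structure of 'source' and hard-link every
    # file into it; unchanged data is stored only once, so a snapshot
    # costs little more than the files that changed. Note that hard
    # links require snapshot_dir to be on the same filesystem as source.
    for dirpath, _dirnames, filenames in os.walk(source):
        relative = os.path.relpath(dirpath, source)
        target_dir = os.path.normpath(os.path.join(snapshot_dir, relative))
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        for name in filenames:
            os.link(os.path.join(dirpath, name),
                    os.path.join(target_dir, name))

A caller could then take daily snapshots, e.g. snapshot_tree("/srv/http/repo/public", "/srv/http/snapshots/2013-01-01"), with each snapshot sharing all unchanged package files with the live tree.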
Also, wrapping mirror-snapshot, there is mirror-snapshot-sequence (untested), which fetches from Arch and ArchARM servers before syncing from a Parabola server; a rough sketch of this ordering is given after the links below. This script also depends on mirror-rename.
http://evc.link/mirror-snapshot/mirror-snapshot-sequence
http://evc.link/mirror-snapshot/mirror-rename
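The ordering matters because most of Parabola's packages are shared with Arch, so the bulk of the data can be fetched from upstream mirrors before a final pass against a Parabola server settles the repository state. The following is only a rough sketch of that idea, not the actual scripts; the rsync sources, options, and target path are all placeholders:

#!/usr/bin/env python
"""Sketch of the sequence idea: upstream mirrors first, Parabola last."""
import subprocess

TARGET = "/srv/http/repo/public/"  # placeholder target directory

# Placeholder rsync sources; the real scripts use their own mirror list.
SOURCES = [
    "rsync://archmirror.example.org/archlinux/",    # an Arch mirror
    "rsync://armmirror.example.org/archlinuxarm/",  # an ArchARM mirror
    "rsync://parabola.example.org/repos/",          # a Parabola server
]

for source in SOURCES:
    # Earlier passes only pre-seed packages shared with Arch/ArchARM;
    # the final Parabola pass determines the published state.
    subprocess.check_call(["rsync", "-rtl", source, TARGET])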
Python script
Alternatively, a mirror can be created with the Python script given below.
Adjust TARGET_DIR before using the script. You can also change the MIRROR used for syncing ("http://repo.parabolagnulinux.org" in London or "http://parabolaweb.eu" in Nuremberg) and raise NUM_WORKERS to speed up downloading and verification if you have multiple cores and fast disk and network I/O available.
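For example, the settings at the top of the script might be edited as follows (the values shown are only illustrative):

MIRROR = "http://parabolaweb.eu"      # Nuremberg; the London server is the default
TARGET_DIR = "/srv/http/repo/public"  # must be writable by the syncing user
NUM_WORKERS = 4                       # parallel download/verification threads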
To synchronise, execute the script as a user with write permission on TARGET_DIR. Run it regularly to keep the mirror up to date, e.g. from a cron job, owned by that user, that synchronises once a day.
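For example, a crontab entry for that user could look like this (the installed path of the script and the time of day are only examples):

# m  h  dom mon dow  command
30   3  *   *   *    /usr/local/bin/parabola-mirror-sync.py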
Finally, you can set up a webserver such as Apache or nginx to serve files from TARGET_DIR to the public.
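For instance, a minimal nginx server block might look like the following sketch (the hostname is a placeholder, and root must match TARGET_DIR):

server {
    listen 80;
    server_name mirror.example.org;  # placeholder hostname
    root /srv/http/repo/public;      # TARGET_DIR from the script
    autoindex on;                    # let users browse the repository tree
}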
#!/usr/bin/env python
"""
Synchronize files for a mirror for an Arch Linux based distribution.

The script works by downloading the package databases, reading out
packages and then downloading these packages. Old packages are deleted
before synchronizing, i.e. packages no longer included in the updated
database remain available to users until the next time the server is
synchronized.
"""
import hashlib
import os
import re
import shutil
import tarfile
import urllib2

from Queue import Queue
from tempfile import mkdtemp
from threading import Thread

MIRROR = "http://repo.parabolagnulinux.org"
TARGET_DIR = "/srv/http/repo/public"
REPOS = ["core", "community", "extra", "libre", "libre-testing",
         "multilib", "testing"]
ARCHES = ["i686", "x86_64", "mips64el"]
NUM_WORKERS = 1


def get_arches_repos():
    "Create valid combinations of architectures and repositories"
    # multilib only exists for x86_64.
    return [(arch, repo) for arch in ARCHES for repo in REPOS
            if not (repo == "multilib" and arch in ["i686", "mips64el"])]


def parse_desc(descfile, filename):
    "Parse desc file found for file with 'filename'"
    data = {}
    # Strip the trailing newline from every line.
    lines = (line[:-1] for line in descfile.readlines())
    last_key = None
    for num, line in enumerate(lines, 1):
        if not last_key:
            if not re.match(r'%[A-Z0-9]+%', line):
                raise Exception(
                    "Expected key in line %d in file %s, found '%s'"
                    % (num, filename, line))
            last_key = line[1:-1].lower()
        else:
            if not line:
                last_key = None
            elif last_key not in data:
                data[last_key] = line
            elif type(data[last_key]) == list:
                data[last_key].append(line)
            else:
                data[last_key] = [data[last_key], line]
    return data


def download_file(url, filename, md5sum):
    """
    Download a file from url and store it in filename.

    Checks the md5sum while downloading.
    """
    try:
        in_stream = urllib2.urlopen(url)
    except IOError as error:
        print("Error downloading %s: %s" % (url, str(error)))
        return False
    hash_calc = hashlib.md5()
    with open(filename, "wb") as target:
        for chunk in iter(lambda: in_stream.read(10240), ''):
            hash_calc.update(chunk)
            target.write(chunk)
    return md5sum.lower() == hash_calc.hexdigest()


def get_repo_location(repo, arch, filename):
    "Get path to file in folder for repo in filesystem."
    return os.path.join(TARGET_DIR, repo, "os", arch, filename)


def get_pool_location(repo, filename):
    "Get path to file in package pool in filesystem."
    pool_dir = "community" if repo == "community" else "packages"
    return os.path.join(TARGET_DIR, "pool", pool_dir, filename)


def get_url(repo, arch, filename):
    "Get URL for file on mirror."
    return MIRROR + "/" + repo + "/os/" + arch + "/" + filename


def handle_download(file_queue, error_queue):
    """
    Check existence of a package in the filesystem and download it if
    necessary.
    """
    while True:
        name, repo, arch, md5sum = file_queue.get()
        url = get_url(repo, arch, name)
        repo_name = get_repo_location(repo, arch, name)
        pool_name = get_pool_location(repo, name)
        if not os.path.exists(repo_name):
            if not os.path.exists(pool_name):
                is_valid = download_file(url, pool_name, md5sum)
                if not is_valid:
                    error_queue.put(pool_name)
            if not os.path.lexists(repo_name):
                # Packages live in the pool; the repo only holds symlinks.
                os.symlink(pool_name, repo_name)
        file_queue.task_done()


def get_desc_from_database(db_file):
    "Get file descriptions as dict for each file in database."
    tar = tarfile.open(db_file)
    for member in (member for member in tar
                   if member.isfile() and member.name.endswith("/desc")):
        yield parse_desc(tar.extractfile(member), member.name)


def delete_old_packages():
    "Delete packages not found in the databases from the filesystem."
    for arch, repo in get_arches_repos():
        repo_dir = get_repo_location(repo, arch, "")
        if not os.path.exists(repo_dir):
            continue
        db_file = os.path.join(repo_dir, repo + ".db")
        repo_files = set(os.listdir(repo_dir))
        db_files = set([repo + ".db"])
        for desc in get_desc_from_database(db_file):
            db_files.add(desc["filename"])
        for filename in (repo_files - db_files):
            os.remove(os.path.join(repo_dir, filename))
            try:
                os.remove(get_pool_location(repo, filename))
            except OSError:
                pass


def create_workers(file_queue, error_queue):
    "Start workers to check and download packages."
    for _ in range(NUM_WORKERS):
        worker = Thread(
            target=handle_download, args=(file_queue, error_queue))
        worker.daemon = True
        worker.start()


def download_and_read_databases(file_queue, tempdir):
    "Download databases to tempdir and add packages to file_queue."
    for arch, repo in get_arches_repos():
        repo_dir = get_repo_location(repo, arch, "")
        if not os.path.exists(repo_dir):
            os.makedirs(repo_dir)
        url = get_url(repo, arch, repo + ".db")
        in_stream = urllib2.urlopen(url)
        db_file = os.path.join(tempdir, repo + "-" + arch + ".db")
        with open(db_file, "wb") as target:
            shutil.copyfileobj(in_stream, target)
        for desc in get_desc_from_database(db_file):
            file_queue.put((desc["filename"], repo, arch, desc["md5sum"]))


def move_databases(tempdir):
    "Move databases from tempdir to repository in filesystem."
    for arch, repo in get_arches_repos():
        tmp_file = os.path.join(tempdir, repo + "-" + arch + ".db")
        target = get_repo_location(repo, arch, repo + ".db")
        # shutil.move works across filesystems, unlike os.rename, which
        # matters because mkdtemp usually places tempdir on /tmp.
        shutil.move(tmp_file, target)


def print_errors(error_queue):
    "Print any errors stored in error_queue."
    while not error_queue.empty():
        print(error_queue.get())


def main():
    "Main function. Executes steps in order."
    file_queue = Queue()
    error_queue = Queue()
    tempdir = mkdtemp()
    delete_old_packages()
    create_workers(file_queue, error_queue)
    download_and_read_databases(file_queue, tempdir)
    file_queue.join()
    move_databases(tempdir)
    os.rmdir(tempdir)
    print_errors(error_queue)


if __name__ == "__main__":
    main()

# vim: set ts=4 sw=4 et: