Creating a mirror

Note: See Mirroring on demand for an alternative approach in which packages are fetched only when clients of the mirror request them (essentially caching); this is useful for serving multiple clients on internal networks.

mirror-snapshot

mirror-snapshot creates hard-linked snapshots, like those published at http://alfplayer.com/. Both of the following files are needed; a rough sketch of the hard-link idea follows the links.

http://evc.link/mirror-snapshot/mirror-snapshot

http://evc.link/mirror-snapshot/mirror-snapshot-config
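The linked files are the actual scripts. As an illustration of the idea only: each snapshot is a copy of the mirror's directory tree in which every file is a hard link, so packages that did not change between snapshots take up no additional disk space. A minimal sketch of that idea (the snapshot function and its arguments are illustrative; this is not the mirror-snapshot script itself):

import os

def snapshot(current, target):
    "Recreate the tree of 'current' under 'target', hard-linking every file."
    for root, dirs, files in os.walk(current):
        dest = os.path.normpath(
                os.path.join(target, os.path.relpath(root, current)))
        if not os.path.isdir(dest):
            os.makedirs(dest)
        for name in files:
            # Hard links share the file data on disk, so an unchanged
            # package costs only a new directory entry per snapshot.
            os.link(os.path.join(root, name), os.path.join(dest, name))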

There is also mirror-snapshot-sequence (untested), a wrapper around mirror-snapshot that fetches from Arch and ArchARM servers before syncing from a Parabola server. This script additionally depends on mirror-rename.

http://evc.link/mirror-snapshot/mirror-snapshot-sequence

http://evc.link/mirror-snapshot/mirror-rename

Python script

Alternatively, a mirror can be created with the Python script given below.

Adjust TARGET_DIR before using the script. You can change MIRROR ("http://repo.parabolagnulinux.org" in London or "http://parabolaweb.eu" in Nuremberg) to choose which server to sync from, and raise NUM_WORKERS to speed up the download and verification process if you have multiple cores and fast disk and network I/O available.
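For example, the configuration block at the top of the script might look like this after adjusting it for the Nuremberg server on a machine with several cores (values are illustrative):

MIRROR = "http://parabolaweb.eu"
TARGET_DIR = "/srv/http/repo/public"
NUM_WORKERS = 4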

To synchronise, execute the script as a user with write permissions on TARGET_DIR. To keep your mirror up to date, run the script regularly, e.g. from a cron job that synchronises once a day, as in the example below.
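For example, a crontab entry along these lines (the script path is a placeholder; adjust it to wherever you saved the script) synchronises every night at 03:00. Install it with crontab -e as the user that owns TARGET_DIR:

0 3 * * * /usr/local/bin/parabola-mirror-sync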

Finally, you can set up a webserver such as Apache or nginx to serve files from TARGET_DIR to the public.
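For instance, a minimal nginx server block, assuming the default TARGET_DIR of /srv/http/repo/public (the server name is a placeholder):

server {
    listen 80;
    server_name mirror.example.org;  # placeholder, use your mirror's hostname
    root /srv/http/repo/public;      # TARGET_DIR
    autoindex on;                    # allow browsing the repository tree
}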

#!/usr/bin/env python2

"""
Synchronize files for a mirror of an Arch Linux-based distribution.

The script works by downloading the package databases, reading the packages
listed in them and then downloading those packages. Old packages are only
deleted at the start of the next synchronization, i.e. packages no longer
listed in the updated database remain available to users until the next
time the server is synchronized.
"""

import hashlib
import os
import re
import shutil
import tarfile
import urllib2

from Queue import Queue
from tempfile import mkdtemp
from threading import Thread

MIRROR = "http://repo.parabolagnulinux.org"
TARGET_DIR = "/srv/http/repo/public"
REPOS = ["core", "community", "extra", "libre", "libre-testing", "multilib",
         "testing"]
ARCHES = ["i686", "x86_64", "mips64el"]
NUM_WORKERS = 1


def get_arches_repos():
    "Create valid combinations of architectures and repositories"
    return [(arch, repo) for arch in ARCHES for repo in REPOS
            if not (repo == "multilib" and arch in ["i686", "mips64el"])]


def parse_desc(descfile, filename):
    "Parse desc file found for file with 'filename'"
    data = {}
    lines = (line.rstrip("\n") for line in descfile.readlines())

    last_key = None
    for num, line in enumerate(lines, 1):
        if not last_key:
            if not re.match(r'%[A-Z0-9]+%', line):
                raise Exception(
                    "Expected key in line %d in file %s, found '%s'"
                    % (num, filename, line))
            last_key = line[1:-1].lower()
        else:
            if not line:
                last_key = None
            else:
                if last_key not in data:
                    data[last_key] = line
                else:
                    if isinstance(data[last_key], list):
                        data[last_key].append(line)
                    else:
                        data[last_key] = [data[last_key], line]
    return data


def download_file(url, filename, md5sum):
    """
    Download a file from url and store it in filename. Checks the md5sum
    while downloading.
    """
    try:
        in_stream = urllib2.urlopen(url)
    except IOError as error:
        print("Error downloading %s: %s" % (url, str(error)))
        return False
    hash_calc = hashlib.md5()
    with open(filename, "wb") as target:
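        # Read the download in 10 KiB chunks, updating the checksum while
        # writing so the file never has to be read back from disk.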
        for chunk in iter(lambda: in_stream.read(10240), ''):
            hash_calc.update(chunk)
            target.write(chunk)
    return md5sum.lower() == hash_calc.hexdigest()


def get_repo_location(repo, arch, filename):
    "Get path to file in folder for repo in filesystem."
    return os.path.join(TARGET_DIR, repo, "os", arch, filename)


def get_pool_location(repo, filename):
    "Get path to file in package pool in filesystem."
    pool_dir = "community" if repo == "community" else "packages"
    return os.path.join(TARGET_DIR, "pool", pool_dir, filename)


def get_url(repo, arch, filename):
    "Get URL for file on mirror."
    return MIRROR + "/" + repo + "/os/" + arch + "/" + filename


def handle_download(file_queue, error_queue):
    """
    Check existence of a package in the filesystem and download it if
    necessary.
    """
    while True:
        name, repo, arch, md5sum = file_queue.get()
        url = get_url(repo, arch, name)

        repo_name = get_repo_location(repo, arch, name)
        pool_name = get_pool_location(repo, name)

        if not os.path.exists(repo_name):
            if not os.path.exists(pool_name):
                is_valid = download_file(url, pool_name, md5sum)
                if not is_valid:
                    error_queue.put(pool_name)
            # Only link files that made it into the pool; a failed download
            # would otherwise leave a dangling symlink in the repository.
            if os.path.exists(pool_name) and not os.path.lexists(repo_name):
                os.symlink(pool_name, repo_name)
        file_queue.task_done()


def get_desc_from_database(db_file):
    "Get file descriptions as dict for each file in database."
    tar = tarfile.open(db_file)
    for member in tar:
        if member.isfile() and member.name.endswith("/desc"):
            yield parse_desc(tar.extractfile(member), member.name)
    tar.close()


def delete_old_packages():
    "Delete packages not found in the databases from the filesystem."
    for arch, repo in get_arches_repos():
        repo_dir = get_repo_location(repo, arch, "")
        if not os.path.exists(repo_dir):
            continue

        db_file = os.path.join(repo_dir, repo + ".db")
        # Skip repositories whose database was never downloaded, e.g. after
        # an interrupted first run.
        if not os.path.exists(db_file):
            continue

        repo_files = set(os.listdir(repo_dir))

        db_files = set([repo + ".db"])
        for desc in get_desc_from_database(db_file):
            db_files.add(desc["filename"])

        for filename in (repo_files - db_files):
            os.remove(os.path.join(repo_dir, filename))
            try:
                os.remove(get_pool_location(repo, filename))
            except OSError:
                pass


def create_workers(file_queue, error_queue):
    "Start workers to check and download packages."
    for _ in range(NUM_WORKERS):
        worker = Thread(
                target=handle_download,
                args=(file_queue, error_queue))
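        # Daemon threads let the interpreter exit even though
        # handle_download never returns.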
        worker.daemon = True
        worker.start()


def download_and_read_databases(file_queue, tempdir):
    "Download databases to tempdir and add packages to file_queue."
    for arch, repo in get_arches_repos():
        repo_dir = get_repo_location(repo, arch, "")
        if not os.path.exists(repo_dir):
            os.makedirs(repo_dir)

        url = get_url(repo, arch, repo + ".db")

        in_stream = urllib2.urlopen(url)
        db_file = os.path.join(tempdir, repo + "-" + arch + ".db")
        with open(db_file, "wb") as target:
            shutil.copyfileobj(in_stream, target)

        for desc in get_desc_from_database(db_file):
            file_queue.put((desc["filename"], repo, arch, desc["md5sum"]))


def move_databases(tempdir):
    "Move databases from tempdir to repository in filesystem."
    for arch, repo in get_arches_repos():
        tmp_file = os.path.join(tempdir, repo + "-" + arch + ".db")
        target = get_repo_location(repo, arch, repo + ".db")
        # shutil.move works across filesystems, unlike os.rename; the
        # temporary directory may be on a different mount than TARGET_DIR.
        shutil.move(tmp_file, target)


def print_errors(error_queue):
    "Print any errors stored in error_queue."
    while not error_queue.empty():
        print(error_queue.get())


def main():
    "Main function. Executes steps in order."
    file_queue = Queue()
    error_queue = Queue()
    tempdir = mkdtemp()

    delete_old_packages()
    create_workers(file_queue, error_queue)
    download_and_read_databases(file_queue, tempdir)

    file_queue.join()
    move_databases(tempdir)

    os.rmdir(tempdir)
    print_errors(error_queue)

if __name__ == "__main__":
    main()
# vim: set ts=4 sw=4 et: