/****************************************************************************
**
** Copyright (C) 2012 Intel Corporation
**
** $QT_BEGIN_LICENSE:BSD$
** You may use this file under the terms of the BSD license as follows:
**
** "Redistribution and use in source and binary forms, with or without
** modification, are permitted provided that the following conditions are
** met:
**   * Redistributions of source code must retain the above copyright
**     notice, this list of conditions and the following disclaimer.
**   * Redistributions in binary form must reproduce the above copyright
**     notice, this list of conditions and the following disclaimer in
**     the documentation and/or other materials provided with the
**     distribution.
**   * Neither the name of Nokia Corporation and its Subsidiary(-ies) nor
**     the names of its contributors may be used to endorse or promote
**     products derived from this software without specific prior written
**     permission.
**
** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
** OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
** LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
** OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
**
** $QT_END_LICENSE$
**
****************************************************************************/

#define _GNU_SOURCE
#include "forkfd.h"

#include <sys/types.h>
#include <sys/wait.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CHILDREN_IN_SMALL_HEADER    16
#define CHILDREN_IN_HEADER          256
#define sizeofarray(array)          (sizeof(array)/sizeof(array[0]))
#define EINTR_LOOP(ret, call) \
    do {                      \
        ret = call;           \
    } while (ret == -1 && errno == EINTR)

/* atomics */
/* we'll use the GCC 4.7 atomic builtins
 * See http://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html#_005f_005fatomic-Builtins
 * Or in texinfo: C Extensions > __atomic Builtins
 */
#if !defined(__GNUC__) || \
    ((__GNUC__ - 0) * 100 + (__GNUC_MINOR__ - 0)) < 407 || \
    defined(__INTEL_COMPILER) || defined(__clang__)
#define __atomic_load_n(ptr,order) *(ptr)
#define __atomic_store_n(ptr,val,order) (*(ptr) = (val), (void)0)
#define __atomic_exchange_n(ptr,val,order) __sync_lock_test_and_set(ptr, val)
#define __atomic_compare_exchange_n(ptr,expected,desired,weak,order1,order2) \
    __sync_bool_compare_and_swap(ptr, *(expected), desired) ? 1 : \
    (*(expected) = *(ptr), 0)
#define __atomic_add_fetch(ptr,val,order) __sync_add_and_fetch(ptr, val)
#endif

typedef struct process_info
{
    int pid;
    int deathPipe;
} ProcessInfo;

struct BigArray;
typedef struct Header
{
    struct BigArray *nextArray;
    int busyCount;
} Header;

typedef struct BigArray
{
    Header header;
    ProcessInfo entries[CHILDREN_IN_HEADER];
} BigArray;

typedef struct SmallArray
{
    Header header;
    ProcessInfo entries[CHILDREN_IN_SMALL_HEADER];
} SmallArray;
static SmallArray children;

static struct sigaction old_sigaction;
static int status = 0;

static int spin_lock()
{
    int expected = 1;
    while (!__atomic_compare_exchange_n(&status, &expected, 2, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
        /* yield */
        if (expected == 0)
            return 0;
#if defined(__i386__) || defined(__x86_64__)
        asm volatile ("pause");
#endif
        expected = 1;
    }
    return 1;
}

static void spin_unlock()
{
    assert(status == 2);
    __atomic_store_n(&status, 1, __ATOMIC_RELAXED);
}

static ProcessInfo *tryAllocateInSection(Header *header, ProcessInfo entries[], int maxCount)
{
    /* we use ACQUIRE here because the signal handler might have released the PID */
    int busyCount = __atomic_add_fetch(&header->busyCount, 1, __ATOMIC_ACQUIRE);
    if (busyCount <= maxCount) {
        /* there's an available entry in this section, find it and take it */
        int i;
        for (i = 0; i < maxCount; ++i) {
            /* if the PID is 0, it's free; mark it as used by swapping it with -1 */
            int expected_pid = 0;
            if (__atomic_compare_exchange_n(&entries[i].pid, &expected_pid,
                                            -1, 1 /* true */, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
                return &entries[i];
        }
    }

    /* there isn't an available entry, undo our increment */
    __atomic_add_fetch(&header->busyCount, -1, __ATOMIC_RELAXED);
    return NULL;
}

static ProcessInfo *allocateInfo()
{
    Header *currentHeader = &children.header;

    /* try to find an available entry in the small array first */
    ProcessInfo *info =
            tryAllocateInSection(currentHeader, children.entries, sizeofarray(children.entries));
    if (info != NULL)
        return info;

    /* go on to the next arrays */
    while (info == NULL) {
        BigArray *array = __atomic_load_n(&currentHeader->nextArray, __ATOMIC_ACQUIRE);
        if (array == NULL) {
            /* allocate an array and try to use it */
            BigArray *allocatedArray = calloc(1, sizeof(BigArray));
            if (allocatedArray == NULL)
                return NULL;

            if (__atomic_compare_exchange_n(&currentHeader->nextArray, &array, allocatedArray,
                                             1 /* true */, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)) {
                /* success */
                array = allocatedArray;
            } else {
                /* failed, the atomic updated 'array' */
                free(allocatedArray);
            }
        }

        currentHeader = &array->header;
        info = tryAllocateInSection(currentHeader, array->entries, sizeof(array->entries));
    }

    return info;
}

static void notifyAndFree(Header *header, ProcessInfo *entry, siginfo_t *info)
{
    ssize_t ret;
    EINTR_LOOP(ret, write(entry->deathPipe, info, sizeof(*info)));
    EINTR_LOOP(ret, close(entry->deathPipe));

    /* reap the child */
    waitpid(entry->pid, NULL, WNOHANG);

    entry->deathPipe = -1;
    entry->pid = 0;

    __atomic_add_fetch(&header->busyCount, -1, __ATOMIC_RELEASE);
    assert(header->busyCount >= 0);
}

static void sigchld_handler(int signum, siginfo_t *info, void *context)
{
    /*
     * This is a signal handler, so we need to be careful about which functions
     * we can call. See the full, official listing in the POSIX.1-2008
     * specification at:
     *   http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_03
     *
     */

    if (spin_lock()) {
        /* is this one of our children? */
        BigArray *array;
        int i;

        for (i = 0; i < (int)sizeofarray(children.entries); ++i) {
            int pid = __atomic_load_n(&children.entries[i].pid, __ATOMIC_ACQUIRE);
            if (pid == info->si_pid) {
                /* this is our child, send notification and free up this entry */
                notifyAndFree(&children.header, &children.entries[i], info);
                spin_unlock();
                return;
            }
        }

        /* try the arrays */
        array = __atomic_load_n(&children.header.nextArray, __ATOMIC_ACQUIRE);
        while (array != NULL) {
            for (i = 0; i < (int)sizeofarray(array->entries); ++i) {
                int pid = __atomic_load_n(&array->entries[i].pid, __ATOMIC_ACQUIRE);
                if (pid == info->si_pid) {
                    /* this is our child, send notification and free up this entry */
                    notifyAndFree(&array->header, &array->entries[i], info);
                    spin_unlock();
                    return;
                }
            }

            array = __atomic_load_n(&array->header.nextArray, __ATOMIC_ACQUIRE);
        }

        spin_unlock();
    }

    /* if we got here, we didn't find this PID as any of our children,
     * therefore chain any old handlers that there might have been.
     * Or we're shutting down. */
    if (old_sigaction.sa_handler != SIG_IGN && old_sigaction.sa_handler != SIG_DFL)
        old_sigaction.sa_flags & SA_SIGINFO ?
                    old_sigaction.sa_sigaction(signum, info, context) :
                    old_sigaction.sa_handler(signum);
}


#ifdef __GNUC__
__attribute((destructor, unused)) static void cleanup();
#endif

static void cleanup()
{
    BigArray *array;
    /* This function is not thread-safe!
     * It must only be called when the process is shutting down.
     * At shutdown, we expect no one to be calling forkfd(), so we don't
     * need to be thread-safe with what is done there.
     * But there might still be daemon threads running, which in turn could
     * receive and process a SIGCHLD.
     */

    /* notify the handler that we're no longer in operation */
    spin_lock();
    __atomic_store_n(&status, 0, __ATOMIC_RELAXED);

    /* free any arrays we might have */
    array = children.header.nextArray;
    while (array != NULL) {
        BigArray *next = array->header.nextArray;
        free(array);
        array = next;
    }
}

static int create_pipe(int filedes[], int flags)
{
    int ret;
#ifdef __linux__
    ret = pipe2(filedes, O_CLOEXEC);
    if (ret != -1) {
        if (flags & FFD_NONBLOCK)
            fcntl(filedes[0], F_SETFL, fcntl(filedes[0], F_GETFL) | O_NONBLOCK);
        if ((flags & FFD_CLOEXEC) == 0)
            fcntl(filedes[0], F_SETFD, 0);
        return ret;
    }
    return ret;
#else
    ret = pipe(filedes);
    if (ret == -1)
        return ret;

    fcntl(filedes[1], F_SETFD, FD_CLOEXEC);
    if (flags & FFD_CLOEXEC)
        fcntl(filedes[0], F_SETFD, FD_CLOEXEC);
    if (flags & FFD_NONBLOCK)
        fcntl(filedes[0], F_SETFL, fcntl(filedes[0], F_GETFL) | O_NONBLOCK);
    return ret;
#endif
}

/**
 * @brief forkfd returns a file descriptor representing a child process
 * @return a file descriptor, or -1 in case of failure
 *
 * forkfd() creates a file descriptor that can be used to be notified of when a
 * child process exits. This file descriptor can be monitored using select(2),
 * poll(2) or similar mechanisms.
 *
 * The @a flags parameter can contain the following values ORed to change the
 * behaviour of forkfd():
 *
 * @li @c FFD_NONBLOCK Set the O_NONBLOCK file status flag on the new open file
 * descriptor. Using this flag saves extra calls to fnctl(2) to achieve the same
 * result.
 *
 * @li @c FFD_CLOEXEC Set the close-on-exec (FD_CLOEXEC) flag on the new file
 * descriptor. You probably want to set this flag, since forkfd() does not work
 * if the original parent process dies.
 *
 * The file descriptor returned by forkfd() supports the following operations:
 *
 * @li read(2) When the child process exits, then the buffer supplied to
 * read(2) is used to return information about the status of the child in the
 * form of one @c siginfo_t structure. The buffer must be at least
 * sizeof(siginfo_t) bytes. The return value of read(2) is the total number of
 * bytes read.
 *
 * @li poll(2), select(2) (and similar) The file descriptor is readable (the
 * select(2) readfds argument; the poll(2) POLLIN flag) if the child has exited
 * or signalled via SIGCHLD.
 *
 * @li close(2) When the file descriptor is no longer required it should be closed.
 */
int forkfd(int flags, pid_t *ppid)
{
    sigset_t sigset;
    ProcessInfo *info;
    pid_t pid;
    int fd = -1;
    int pipes[2];

    if (__atomic_exchange_n(&status, 1, __ATOMIC_RELAXED) == 0) {
        /* install our signal handler */
        struct sigaction action;
        memset(&action, 0, sizeof action);
        sigemptyset(&action.sa_mask);
        action.sa_flags = SA_NOCLDSTOP | SA_SIGINFO;
        action.sa_sigaction = sigchld_handler;

        /* ### RACE CONDITION
         * The sigaction function does a memcpy from an internal buffer
         * to old_sigaction, which we use in the SIGCHLD handler. If a
         * SIGCHLD is delivered before or during that memcpy, the handler will
         * see an inconsistent state.
         *
         * There is no solution. pthread_sigmask doesn't work here because the
         * signal could be delivered to another thread.
         */
        sigaction(SIGCHLD, &action, &old_sigaction);

#ifndef __GNUC__
        atexit(cleanup);
#endif
    }

    info = allocateInfo();
    if (info == NULL) {
        errno = ENOMEM;
        goto out;
    }

    /* create the pipes before we fork */
    if (create_pipe(pipes, flags) == -1)
        goto out; /* failed to create the pipes, pass errno */

    /* block the handler temporarily */
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGCHLD);
    pthread_sigmask(SIG_BLOCK, &sigset, NULL);
    spin_lock();

    /* now fork */
    pid = fork();
    if (ppid)
        *ppid = pid;

    if (pid == 0) {
        /* this is the child process, so we should close the
         * pipe and return to the caller */
        int ret;
        EINTR_LOOP(ret, close(pipes[0]));
        EINTR_LOOP(ret, close(pipes[1]));
        fd = FFD_CHILD_PROCESS;
    } else if (pid != -1) {
        /* parent process, no failure */
        info->deathPipe = pipes[1];
        fd = pipes[0];
        __atomic_store_n(&info->pid, pid, __ATOMIC_RELEASE);
    }

    /* unblock the handler */
    spin_unlock();
    pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);

out:
    return fd;
}
