Q: My rsync-over-ssh transfers (rsync -e ssh) always seem to hang at the same large file. If I copy that file over by hand using scp or ftp, rsync gets past it, but then hangs at the next large file.

A: This is a known problem in Tatu Ylönen's (SSH Communications Security, Ltd.'s) ssh — and likewise in the OpenBSD Foundation's OpenSSH prior to v. 2.1.1p4 (see below): a select(2) deadlock between ssh and rsync. An effective fix exists for it, but its author never contributed it formally to ssh because the latter became a proprietary product. However, the fix remains available at
http://gcc.gnu.org/ml/gcc/2000-05/msg00248.html .

Its text follows. Save it as "unblock_ssh.c", compile it, put the resulting binary in the same directory as ssh, and have rsync invoke it instead of invoking ssh directly (e.g., rsync -e /path/to/unblock_ssh ...), i.e., use it as a wrapper around ssh.


/* Written by Ton Hospel */
/* Hereby put under GNU copyleft */

#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <errno.h>
#ifndef HAVE_NO_UNISTD_H
# include <unistd.h>
#endif /* HAVE_NO_UNISTD_H */
#include <fcntl.h>

/* Name of the real program to exec; it replaces the last component of argv[0]. */
static char ssh[] = "ssh";

/*
 * Add O_NONBLOCK to the file status flags of `fd`, keeping every flag that
 * is already set (a bare F_SETFL with only O_NONBLOCK would clear flags
 * such as O_APPEND on the inherited descriptor).
 *
 * `name` is the stream name used in the error message.
 * Returns 0 on success; on failure prints a diagnostic to stderr and
 * returns -1.
 */
static int set_nonblocking(int fd, const char *name) {
    int flags = fcntl(fd, F_GETFL);

    if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        fprintf(stderr, "Could not unblock %s: %s\n", name, strerror(errno));
        return -1;
    }
    return 0;
}

/*
 * Wrapper around ssh: switches stdin, stdout and stderr to non-blocking
 * mode, then execs "ssh" with the same arguments.
 *
 * The program to exec is built from argv[0]: everything up to and including
 * the last '/' is kept and "ssh" is appended. If argv[0] contains no '/',
 * the result is plain "ssh" and execvp() searches PATH for it.
 *
 * Returns 1 if setup fails, or the (failing) return value of execvp();
 * on success execvp() does not return.
 */
int main(int argc, char **argv) {
    int rc;
    char *base, *work;
    size_t dirlen;

    /* argv[0] is needed to locate ssh; a caller may legally pass no args. */
    if (argc < 1 || argv[0] == NULL) {
        fprintf(stderr, "Could not determine program name\n");
        return 1;
    }

    if (set_nonblocking(fileno(stdin), "stdin") < 0) return 1;
    if (set_nonblocking(fileno(stdout), "stdout") < 0) return 1;
    if (set_nonblocking(fileno(stderr), "stderr") < 0) return 1;

    /* Point `base` just past the last '/', or at the start if there is none. */
    base = strrchr(argv[0], '/');
    base = (base == NULL) ? argv[0] : base + 1;
    dirlen = (size_t)(base - argv[0]);

    /* sizeof(ssh) includes the terminating NUL. */
    work = malloc(dirlen + sizeof(ssh));
    if (!work) {
        fprintf(stderr, "Out of memory. Buy more ?\n");
        return 1;
    }
    memcpy(work, argv[0], dirlen);
    memcpy(work + dirlen, ssh, sizeof(ssh));

    argv[0] = work;
    rc = execvp(work, argv);
    /* Only reached if execvp() failed. */
    fprintf(stderr, "Could not exec %.300s: %s\n", work, strerror(errno));
    return rc;
}


Starting with OpenSSH v. 2.1.1p4, OpenSSH incorporates a patch by H.J. Lu that applies the same fix within that package.