/* This is a quick sort implementation for numpy strings, mainly for
   testing purposes, but could be useful in the future.  It happens to
   be between 2.5x and 3x (for long enough arrays) than the string
   sort in NumPy (based in merge sort).

   Francesc Altet 2008-02-07
 */

#define sSWAP(a,b) {						\
    for (i=0; i<ss; i++) {					\
      ((char *)SWAP_temp)[i] = ((char *)(b))[i];		\
      ((char *)b)[i] = ((char *)(a))[i];			\
      ((char *)a)[i] = ((char *)(SWAP_temp))[i];		\
    }								\
  }


#define opt_memcpy(a,b,n) {				\
    for (i=0; i<(n); i++) {				\
      ((char *)a)[i] = ((char *)(b))[i];		\
    }							\
  }


/* opt_strncmp is an optimized version of strncmp, mainly for easy the
   inlining by the compiler.  I'm not sure whether the inline would
   create problems with other compilers than GCC, but it can safely
   removed and let the compiler decide if it should do the inlining or
   not.  In any case, the inlining can improve the global performance
   by a 20% or more.

   This is only valid for regular strings, but adding support for UCS4
   should be a matter of replacing 'char' by 'PyArray_UCS4', I think.
 */
static inline int opt_strncmp(char *a, char *b, int n) {
  int i;
  for (i=0; i<n; i++) {
    if (a[i] > b[i]) return i;
    if (a[i] < b[i]) return -i;
  }
  return 0;
}


/* start: the address where the data for 1-D string array starts
   ss: the size of the string elements.
   num: the number of elements
  */
int quicksort_string(char *start, int ss, intp num)
{
  char *pl = start;
  char *pr = start + (num - 1) * ss;
  char *vp;
  char *SWAP_temp;
  char *stack[PYA_QS_STACK], **sptr = stack;
  char *pm, *pi, *pj, *pt;
  int i;

  vp = malloc(ss);
  SWAP_temp = malloc(ss);
  for(;;) {
    while (((pr - pl)/ss) > SMALL_QUICKSORT) {
      /* quicksort partition */
      pm = pl + (((pr-pl)/ss) >> 1)*ss;
      if (opt_strncmp(pm, pl, ss) < 0) { sSWAP(pm, pl); }
      if (opt_strncmp(pr, pm, ss) < 0) { sSWAP(pr, pm); }
      if (opt_strncmp(pm, pl, ss) < 0) { sSWAP(pm, pl); }
      opt_memcpy(vp, pm, ss);
      pi = pl;
      pj = pr - ss;
      sSWAP(pm, pj);
      for(;;) {
	do { pi += ss; } while (opt_strncmp(pi, vp, ss) < 0);
	do { pj -= ss; } while (opt_strncmp(vp, pj, ss) < 0);
	if (pi >= pj)  break;
	sSWAP(pi, pj);
      }
      sSWAP(pi, pr-ss);
      /* push largest partition on stack */
      if (pi - pl < pr - pi) {
	*sptr++ = pi + ss;
	*sptr++ = pr;
	pr = pi - ss;
      }else{
	*sptr++ = pl;
	*sptr++ = pi - ss;
	pl = pi + ss;
      }
    }
    /* insertion sort */
    for(pi = pl + ss; pi <= pr;	pi += ss) {
      opt_memcpy(vp, pi, ss);
      for(pj = pi, pt = pi - ss; pj > pl && opt_strncmp(vp, pt, ss) < 0;) {
	opt_memcpy(pj, pt, ss);
	pj -= ss; pt -= ss;
      }
      opt_memcpy(pj, vp, ss);
    }
    if (sptr == stack) break;
    pr = *(--sptr);
    pl = *(--sptr);
  }

  free(vp);
  free(SWAP_temp);

  return 0;
}


