51   return a >= b ? a : b;
 
   58   else return (
double)n*
fak(n-1);
 
   73   for (k=0; k<=m-r; k++) {
 
   74     sum+=
binom(m+k,k)*pow((xx+1.0)/2.0,(
double)k);
 
   76   return sum*pow((xx+1.0),(
double)r)*pow(1.0-xx,(
double)(m+1))/(1<<(m+1))/
fak(r); 
 
   80 double _Complex 
regkern(kernel k, 
double xx, 
int p, 
const double *param, 
double a, 
double b)
 
   83   double _Complex sum=0.0;
 
   89   if ((xx>=-0.5+b && xx<=-a) || (xx>=a && xx<=0.5-b)) {
 
   93     sum=(k(-0.5,0,param)+k(0.5,0,param))/2.0
 
   96       sum+=pow(-b/2.0,(
double)r)
 
  102   else if ((xx>-a) && (xx<a)) {
 
  103     for (r=0; r<p; r++) {
 
  104       sum+=pow(a,(
double)r)
 
  106               +k( a,r,param)*
BasisPoly(p-1,r,-xx/a)*(r & 1 ? -1 : 1));
 
  111     sum=(k(-0.5,0,param)+k(0.5,0,param))/2.0
 
  113     for (r=0; r<p; r++) {
 
  114       sum+=pow(b/2.0,(
double)r)
 
  120   return k(xx,0,param);
 
  126 double _Complex 
regkern1(kernel k, 
double xx, 
int p, 
const double *param, 
double a, 
double b)
 
  129   double _Complex sum=0.0;
 
  135   if ((xx>=-0.5+b && xx<=-a) || (xx>=a && xx<=0.5-b))
 
  137     return k(xx,0,param);
 
  139   else if ((xx>-a) && (xx<a))
 
  141     for (r=0; r<p; r++) {
 
  142       sum+=pow(a,(
double)r)
 
  144               +k( a,r,param)*
BasisPoly(p-1,r,-xx/a)*(r & 1 ? -1 : 1));
 
  150     for (r=0; r<p; r++) {
 
  151       sum+=pow(b,(
double)r)
 
  152           *( k(0.5-b,r,param)*
BasisPoly(p-1,r,(xx+0.5)/b)
 
  153               +k(-0.5+b,r,param)*
BasisPoly(p-1,r,-(xx+0.5)/b)*(r & 1 ? -1 : 1));
 
  159     for (r=0; r<p; r++) {
 
  160       sum+=pow(b,(
double)r)
 
  161           *( k(0.5-b,r,param)*
BasisPoly(p-1,r,(xx-0.5)/b)
 
  162               +k(-0.5+b,r,param)*
BasisPoly(p-1,r,-(xx-0.5)/b)*(r & 1 ? -1 : 1));
 
  166   return k(xx,0,param);
 
  170 double _Complex 
regkern2(kernel k, 
double xx, 
int p, 
const double *param, 
double a, 
double b)
 
  173   double _Complex sum=0.0;
 
  178     for (r=0; r<p; r++) {
 
  179       sum+=pow(b,(
double)r)*k(0.5-b,r,param)
 
  184   else if ((a<=xx) && (xx<=0.5-b)) {
 
  185     return k(xx,0,param);
 
  188     for (r=0; r<p; r++) {
 
  189       sum+=pow(-a,(
double)r)*k(a,r,param)
 
  194   else if ((0.5-b<xx) && (xx<=0.5)) {
 
  195     for (r=0; r<p; r++) {
 
  196       sum+=pow(b,(
double)r)*k(0.5-b,r,param)
 
  207 double _Complex 
regkern3(kernel k, 
double xx, 
int p, 
const double *param, 
double a, 
double b)
 
  210   double _Complex sum=0.0;
 
  219   if ((a<=xx) && (xx<=0.5-b)) {
 
  220     return k(xx,0,param);
 
  223     for (r=0; r<p; r++) {
 
  224       sum+=pow(-a,(
double)r)*k(a,r,param)
 
  230   else if ((0.5-b<xx) && (xx<=0.5)) {
 
  231     sum=k(0.5,0,param)*
BasisPoly(p-1,0,-2.0*xx/b+(1.0-b)/b);
 
  233     for (r=0; r<p; r++) {
 
  234       sum+=pow(b/2.0,(
double)r)
 
  244 double _Complex 
linintkern(
const double x, 
const double _Complex *Add,
 
  245   const int Ad, 
const double a)
 
  249   double _Complex f1,f2;
 
  253   f1=Add[r];f2=Add[r+1];
 
  257   return (-f1*c3+f2*c1);
 
  260 double _Complex quadrintkern(
const double x, 
const double _Complex *Add,
 
  261   const int Ad, 
const double a)
 
  265   double _Complex f0,f1,f2;
 
  269   if (r==0) {f0=Add[r+1];f1=Add[r];f2=Add[r+1];}
 
  270   else { f0=Add[r-1];f1=Add[r];f2=Add[r+1];}
 
  275   return (f0*c1*c3/2.0-f1*c2*c3+f2*c2*c1/2.0);
 
  279 double _Complex 
kubintkern(
const double x, 
const double _Complex *Add,
 
  280   const int Ad, 
const double a)
 
  282   double c,c1,c2,c3,c4;
 
  284   double _Complex f0,f1,f2,f3;
 
  287   if (r==0) {f0=Add[r+1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
 
  288   else { f0=Add[r-1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
 
  296   return(-f0*c1*c3*c4/6.0+f1*c2*c3*c4/2.0-f2*c2*c1*c4/2.0+f3*c2*c1*c3/6.0);
 
  300 double _Complex 
kubintkern1(
const double x, 
const double _Complex *Add,
 
  301   const int Ad, 
const double a)
 
  303   double c,c1,c2,c3,c4;
 
  305   double _Complex f0,f1,f2,f3;
 
  311   { f0=Add[r-1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
 
  319   return(-f0*c1*c3*c4/6.0+f1*c2*c3*c4/2.0-f2*c2*c1*c4/2.0+f3*c2*c1*c3/6.0);
 
  323 void quicksort(
int d, 
int t, 
double *x, 
double _Complex *alpha, 
int N)
 
  328   double pivot=x[(N/2)*d+t];
 
  332   double _Complex temp2;
 
  336     while (x[lpos*d+t]<pivot)
 
  338     while (x[rpos*d+t]>pivot)
 
  345         x[lpos*d+k]=x[rpos*d+k];
 
  349       alpha[lpos]=alpha[rpos];
 
  359     quicksort(d,t,x+lpos*d,alpha+lpos,N-lpos);
 
  369   box_index = (
int *) malloc(ths->box_count * 
sizeof(
int));
 
  370   for (t=0; t < ths->box_count; t++)
 
  373   for (l=0; l < ths->
N_total; l++)
 
  376     for (t=0; t < ths->
d; t++)
 
  378       val[t] = ths->
x[ths->
d * l + t] + 0.25 - ths->
eps_B/2.0;
 
  379       ind *= ths->box_count_per_dim;
 
  380       ind += (int) (val[t] / ths->
eps_I);
 
  385   ths->box_offset[0] = 0;
 
  386   for (t=1; t<=ths->box_count; t++)
 
  388     ths->box_offset[t] = ths->box_offset[t-1] + box_index[t-1];
 
  389     box_index[t-1] = ths->box_offset[t-1];
 
  392   for (l=0; l < ths->
N_total; l++)
 
  395     for (t=0; t < ths->
d; t++)
 
  397       val[t] = ths->
x[ths->
d * l + t] + 0.25 - ths->
eps_B/2.0;
 
  398       ind *= ths->box_count_per_dim;
 
  399       ind += (int) (val[t] / ths->
eps_I);
 
  402     ths->box_alpha[box_index[ind]] = ths->
alpha[l];
 
  404     for (t=0; t < ths->
d; t++)
 
  406       ths->box_x[ths->
d * box_index[ind] + t] = ths->
x[ths->
d * l + t];
 
  414 static inline double _Complex 
calc_SearchBox(
int d, 
double *y, 
double *x, 
double _Complex *alpha, 
int start, 
int end_lt, 
const double _Complex *Add, 
const int Ad, 
int p, 
double a, 
const kernel k, 
const double *param, 
const unsigned flags)
 
  416   double _Complex result = 0.0;
 
  421   for (m = start; m < end_lt; m++)
 
  431           r+=(y[l]-x[m*d+l])*(y[l]-x[m*d+l]);
 
  436         result += alpha[m]*k(r,0,param);              
 
  440             result -= alpha[m]*
regkern1(k,r,p,param,a,1.0/16.0); 
 
  447             result -= alpha[m]*
regkern(k,r,p,param,a,1.0/16.0);  
 
  451 #elif defined(NF_QUADR) 
  452             result -= alpha[m]*quadrintkern(r,Add,Ad,a);                
 
  453 #elif defined(NF_LIN) 
  456   #error define interpolation method 
  467   double _Complex val = 0.0;
 
  469   int y_multiind[ths->
d];
 
  470   int multiindex[ths->
d];
 
  473   for (t=0; t < ths->
d; t++)
 
  475     y_multiind[t] = ((y[t] + 0.25 - ths->
eps_B/2.0) / ths->
eps_I);
 
  480     for (y_ind = 
max_i(0, y_multiind[0]-1); y_ind < ths->box_count_per_dim && y_ind <= y_multiind[0]+1; y_ind++){
 
  481       val += 
calc_SearchBox(ths->
d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->
Add, ths->
Ad, ths->
p, ths->
eps_I, ths->
k, ths->
kernel_param, ths->
flags);
 
  486     for (multiindex[0] = 
max_i(0, y_multiind[0]-1); multiindex[0] < ths->box_count_per_dim && multiindex[0] <= y_multiind[0]+1; multiindex[0]++)
 
  487       for (multiindex[1] = 
max_i(0, y_multiind[1]-1); multiindex[1] < ths->box_count_per_dim && multiindex[1] <= y_multiind[1]+1; multiindex[1]++)
 
  489         y_ind = (ths->box_count_per_dim * multiindex[0]) + multiindex[1];
 
  490         val += 
calc_SearchBox(ths->
d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->
Add, ths->
Ad, ths->
p, ths->
eps_I, ths->
k, ths->
kernel_param, ths->
flags);
 
  495     for (multiindex[0] = 
max_i(0, y_multiind[0]-1); multiindex[0] < ths->box_count_per_dim && multiindex[0] <= y_multiind[0]+1; multiindex[0]++)
 
  496       for (multiindex[1] = 
max_i(0, y_multiind[1]-1); multiindex[1] < ths->box_count_per_dim && multiindex[1] <= y_multiind[1]+1; multiindex[1]++)
 
  497         for (multiindex[2] = 
max_i(0, y_multiind[2]-1); multiindex[2] < ths->box_count_per_dim && multiindex[2] <= y_multiind[2]+1; multiindex[2]++)
 
  499           y_ind = ((ths->box_count_per_dim * multiindex[0]) + multiindex[1]) * ths->box_count_per_dim + multiindex[2];
 
  500           val += 
calc_SearchBox(ths->
d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->
Add, ths->
Ad, ths->
p, ths->
eps_I, ths->
k, ths->
kernel_param, ths->
flags);
 
  510 void BuildTree(
int d, 
int t, 
double *x, 
double _Complex *alpha, 
int N)
 
  519     BuildTree(d, (t+1)%d, x+(m+1)*d, alpha+(m+1), N-m-1);
 
  524 double _Complex 
SearchTree(
const int d, 
const int t, 
const double *x,
 
  525   const double _Complex *alpha, 
const double *xmin, 
const double *xmax,
 
  526   const int N, 
const kernel k, 
const double *param, 
const int Ad,
 
  527   const double _Complex *Add, 
const int p, 
const unsigned flags)
 
  530   double Min=xmin[t], Max=xmax[t], Median=x[m*d+t];
 
  531   double a=fabs(Max-Min)/2;
 
  539     return SearchTree(d,(t+1)%d,x+(m+1)*d,alpha+(m+1),xmin,xmax,N-m-1,k,param,Ad,Add,p,flags);
 
  541     return SearchTree(d,(t+1)%d,x,alpha,xmin,xmax,m,k,param,Ad,Add,p,flags);
 
  544     double _Complex result = 0.0;
 
  549       if ( x[m*d+l]>xmin[l] && x[m*d+l]<xmax[l] )
 
  563           r+=(xmin[l]+a-x[m*d+l])*(xmin[l]+a-x[m*d+l]);  
 
  568         result += alpha[m]*k(r,0,param);                         
 
  572             result -= alpha[m]*
regkern1(k,r,p,param,a,1.0/16.0); 
 
  579             result -= alpha[m]*
regkern(k,r,p,param,a,1.0/16.0);  
 
  583 #elif defined(NF_QUADR) 
  584             result -= alpha[m]*quadrintkern(r,Add,Ad,a);                
 
  585 #elif defined(NF_LIN) 
  588   #error define interpolation method 
  593     result += 
SearchTree(d,(t+1)%d,x+(m+1)*d,alpha+(m+1),xmin,xmax,N-m-1,k,param,Ad,Add,p,flags)
 
  594       + 
SearchTree(d,(t+1)%d,x,alpha,xmin,xmax,m,k,param,Ad,Add,p,flags);
 
  600 void fastsum_init_guru(
fastsum_plan *ths, 
int d, 
int N_total, 
int M_total, kernel k, 
double *param, 
unsigned flags, 
int nn, 
int m, 
int p, 
double eps_I, 
double eps_B)
 
  605   int sort_flags_trafo = 0;
 
  606   int sort_flags_adjoint = 0;
 
  608   int nthreads = nfft_get_omp_num_threads();
 
  613     sort_flags_trafo = NFFT_SORT_NODES;
 
  615     sort_flags_adjoint = NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
 
  617     sort_flags_adjoint = NFFT_SORT_NODES;
 
  626   ths->
x = (
double *)
nfft_malloc(d*N_total*(
sizeof(
double)));
 
  627   ths->
alpha = (
double _Complex *)
nfft_malloc(N_total*(
sizeof(
double _Complex)));
 
  629   ths->
y = (
double *)
nfft_malloc(d*M_total*(
sizeof(
double)));
 
  630   ths->
f = (
double _Complex *)
nfft_malloc(M_total*(
sizeof(
double _Complex)));
 
  646       ths->
Ad = 4*(ths->
p)*(ths->
p);
 
  647       ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+5)*(
sizeof(
double _Complex)));
 
  651       if (ths->
k == one_over_x)
 
  656           case 2: delta = 1e-3;
 
  658           case 3: delta = 1e-4;
 
  660           case 4: delta = 1e-5;
 
  662           case 5: delta = 1e-6;
 
  664           case 6: delta = 1e-6;
 
  666           case 7: delta = 1e-7;
 
  668           default: delta = 1e-8;
 
  672         ths->
Ad = 
max_i(10, (
int) ceil(1.4/pow(delta,1.0/4.0)));
 
  673         ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
 
  674 #elif defined(NF_QUADR) 
  675         ths->
Ad = (int) ceil(2.2/pow(delta,1.0/3.0));
 
  676         ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
 
  677 #elif defined(NF_LIN) 
  678         ths->
Ad = (int) ceil(1.7/pow(delta,1.0/2.0));
 
  679         ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
 
  681   #error define NF_LIN or NF_QUADR or NF_KUB 
  686         ths->
Ad = 2*(ths->
p)*(ths->
p);
 
  687         ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
 
  699   nfft_init_guru(&(ths->
mv1), d, N, N_total, n, m,
 
  701                    PRE_PHI_HUT| PRE_PSI| MALLOC_X | MALLOC_F_HAT| MALLOC_F| FFTW_INIT | FFT_OUT_OF_PLACE,
 
  702                    FFTW_MEASURE| FFTW_DESTROY_INPUT);
 
  703   nfft_init_guru(&(ths->
mv2), d, N, M_total, n, m,
 
  705                    PRE_PHI_HUT| PRE_PSI| MALLOC_X | MALLOC_F_HAT| MALLOC_F| FFTW_INIT | FFT_OUT_OF_PLACE,
 
  706                    FFTW_MEASURE| FFTW_DESTROY_INPUT);
 
  713   ths->
b = (fftw_complex *)
nfft_malloc(n_total*
sizeof(fftw_complex));
 
  715 #pragma omp critical (nfft_omp_critical_fftw_plan) 
  717   fftw_plan_with_nthreads(nthreads);
 
  720   ths->fft_plan = fftw_plan_dft(d,N,ths->
b,ths->
b,FFTW_FORWARD,FFTW_ESTIMATE);
 
  726   if (ths->
flags & NEARFIELD_BOXES)
 
  728     ths->box_count_per_dim = floor((0.5 - ths->
eps_B) / ths->
eps_I) + 1;
 
  730     for (t=0; t<ths->
d; t++)
 
  731       ths->box_count *= ths->box_count_per_dim;
 
  733     ths->box_offset = (
int *) 
nfft_malloc((ths->box_count+1) * 
sizeof(
int));
 
  735     ths->box_alpha = (
double _Complex *)
nfft_malloc(ths->
N_total*(
sizeof(
double _Complex)));
 
  752   nfft_finalize(&(ths->
mv1));
 
  753   nfft_finalize(&(ths->
mv2));
 
  756 #pragma omp critical (nfft_omp_critical_fftw_plan) 
  759   fftw_destroy_plan(ths->fft_plan);
 
  766   if (ths->
flags & NEARFIELD_BOXES)
 
  781   #pragma omp parallel for default(shared) private(j,k,t,r) 
  788         r = ths->
y[j] - ths->
x[k];
 
  792         for (t=0; t<ths->
d; t++)
 
  793           r += (ths->
y[j*ths->
d+t]-ths->
x[k*ths->
d+t])*(ths->
y[j*ths->
d+t]-ths->
x[k*ths->
d+t]);
 
  818   if (ths->
flags & NEARFIELD_BOXES)
 
  841       #pragma omp parallel for default(shared) private(k) 
  842       for (k=-ths->
Ad/2-2; k <= ths->Ad/2+2; k++)
 
  845       #pragma omp parallel for default(shared) private(k) 
  846       for (k=0; k <= ths->
Ad+2; k++)
 
  860     for (t=0; t<ths->
mv1.
d; t++)
 
  868     nfft_precompute_psi(&(ths->
mv1));
 
  871     nfft_precompute_full_psi(&(ths->
mv1));
 
  886     for (t=0; t<ths->
mv2.
d; t++)
 
  894     nfft_precompute_psi(&(ths->
mv2));
 
  897     nfft_precompute_full_psi(&(ths->
mv2));
 
  909   for (t=0; t<ths->
d; t++)
 
  912   #pragma omp parallel 
for default(shared) 
private(j,k,t)
 
  913   for (j=0; j<n_total; j++)
 
  921       for (t=0; t<ths->
d; t++)
 
  923         ths->
b[j] += ((double)(k % (ths->
n)) / (ths->
n) - 0.5) * ((double)(k % (ths->
n)) / (ths->
n) - 0.5);
 
  931   fftw_execute(ths->fft_plan);
 
  954   nfft_adjoint(&(ths->
mv1));
 
  965   #pragma omp parallel for default(shared) private(k) 
  989   #pragma omp parallel for default(shared) private(j,k,t) 
  992     double ymin[ths->
d], ymax[ths->
d]; 
 
  994     if (ths->
flags & NEARFIELD_BOXES)
 
 1000       for (t=0; t<ths->
d; t++)
 
 1002         ymin[t] = ths->
y[ths->
d*j+t] - ths->
eps_I;
 
 1003         ymax[t] = ths->
y[ths->
d*j+t] + ths->
eps_I;
 
 1005       ths->
f[j] = ths->
mv2.
f[j] + 
SearchTree(ths->
d,0, ths->
x, ths->
alpha, ymin, ymax, ths->
N_total, ths->
k, ths->
kernel_param, ths->
Ad, ths->
Add, ths->
p, ths->
flags);