FFT for large sizes (geraintluff, 09-07-2016)

I wanted to use the (I)FFT to generate blocks of 65536 samples and above, which is beyond the 32768 limit of the built-in fft()/ifft(), so I wrote functions to perform larger FFTs.

As much work as possible is still done by the built-in functions - this just uses a Cooley-Tukey decomposition to factorise the large FFT into two passes of smaller ones that the built-ins can handle.

It also performs the necessary permutations (part of the algorithm), and it needs some working memory - for any sensible FFT size, 256 slots (128 complex values) is all it takes; see fft_big()'s implementation for details. I've thought about a version that doesn't need this working memory, but it may or may not be slower.
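
For reference, here's a minimal sketch of how the buffer and working_mem arguments used below might be set up in an @init block. The offsets (buffer at 0, working_mem straight after it) are just an assumption for the example; the real requirements are that the data is stored as interleaved real/imaginary pairs (so N complex bins take 2*N memory slots) and that the two areas don't overlap. Scaling behaviour is inherited from the built-ins, since fft_big()/ifft_big() only wrap fft()/ifft().

Code:
buffer = 0;                       // 65536 complex bins -> slots 0..131071
working_mem = buffer + 2*65536;   // 256 slots of scratch (128 complex values)
memset(buffer, 0, 2*65536);       // start from an all-zero spectrum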

Usage:

Code:
fft_big(buffer, 65536, working_mem);
buffer[1] = 1;
ifft_big(buffer, 65536, working_mem);
Code:
// working_space must be 2*sizeB big
function fft_ab(block, sizeA, sizeB, working_space)
		local(N, i, j, twiddle_amount, twiddle_r, twiddle_i, Nbits, shiftleftbits, shiftrightbits, bitmask, index1, source_index, target_index, tmp_r, tmp_i)
		(
	N = sizeA*sizeB;
	i = 0;
	// Perform the stepwise FFTs
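	// For each offset i, take the sizeB samples at stride sizeA (i, i+sizeA, ...),
	// FFT them, and then rotate bin j by the twiddle angle -2*pi*i*j/N, which is
	// what ties the two stages of the Cooley-Tukey decomposition together.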
	while (i < sizeA) (
		j = 0;
		// Copy to working area
		while (j < sizeB) (
			working_space[j*2] = block[(i + j*sizeA)*2];
			working_space[j*2 + 1] = block[(i + j*sizeA)*2 + 1];
			j += 1;
		);
		fft(working_space, sizeB);
		fft_permute(working_space, sizeB);
		// Copy back, with twiddle factors
		j = 0;
		while (j < sizeB) (
			twiddle_amount = -i*j/N;
			twiddle_r = cos(twiddle_amount*2*$pi);
			twiddle_i = sin(twiddle_amount*2*$pi);
		
			block[(i + j*sizeA)*2] = working_space[j*2]*twiddle_r - working_space[j*2 + 1]*twiddle_i;
			block[(i + j*sizeA)*2 + 1] = working_space[j*2]*twiddle_i + working_space[j*2 + 1]*twiddle_r;
			j += 1;
		);

		i += 1;
	);
	
	// Perform the in-place FFTs
	j = 0;
	while (j < sizeB) (
		fft(block + j*sizeA*2, sizeA);
		fft_permute(block + j*sizeA*2, sizeA);
		j += 1;
	);
	
	// Permute
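	// After the two FFT stages above, the result is in a "transposed" order:
	// writing an output bin index as (k2 + k1*sizeB), where k1 in [0,sizeA) comes
	// from the length-sizeA FFTs and k2 in [0,sizeB) from the length-sizeB FFTs,
	// that bin is currently stored at slot (k1 + k2*sizeA). Rotating a target
	// index's bits left by log2(sizeA) gives the slot its value currently lives
	// in, so the reorder can be done in place by walking each permutation cycle
	// once, starting from the cycle's smallest index.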
	Nbits = 1;
	while ((1<<Nbits) < N) (
		Nbits += 1;
	);
	shiftleftbits = 1; // This is the shift required to find the source index for a given target
	bitmask = N - 1;
	while ((1<<shiftleftbits) < sizeA) (
		shiftleftbits += 1;
	);
	shiftrightbits = Nbits - shiftleftbits;
	
	index1 = 1; // Source index
	while (index1 < N) (
		target_index = index1;
		while (
			target_index = ((target_index<<shiftleftbits)&bitmask) + (target_index>>shiftrightbits);
			target_index > index1;
		);
		target_index == index1 ? ( // This index is the shortest in its permutation group
			tmp_r = block[target_index*2];
			tmp_i = block[target_index*2 + 1];
			source_index = ((target_index<<shiftleftbits)&bitmask) + (target_index>>shiftrightbits);
			while (source_index != index1) (
				block[target_index*2] = block[source_index*2];
				block[target_index*2 + 1] = block[source_index*2 + 1];
				target_index = source_index;
				source_index = ((target_index<<shiftleftbits)&bitmask) + (target_index>>shiftrightbits);
			);
			block[target_index*2] = tmp_r;
			block[target_index*2 + 1] = tmp_i;
		);
		index1 += 1;
	);	
);

// working_space needs to be 256 big (128 complex numbers), unless you're doing FFTs of more than 4194304 (2^22)
function fft_big(block, N, working_space) local(sizeB) (
	N > 32768 ? (
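		// sizeB = 128 keeps the scratch area at 256 slots; a larger sizeB is only
		// needed when sizeA = N/sizeB would exceed the 32768 limit of the built-in fft().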
		sizeB = 1<<7;
		(N/sizeB) > 32768 ? sizeB = 1<<11;
		(N/sizeB) > 32768 ? sizeB = 1<<13;
		fft_ab(block, N/sizeB, sizeB, working_space);
	) : (
		fft(block, N);
		fft_permute(block, N);
	);
);

// working_space must be 2*sizeB big
function ifft_ab(block, sizeA, sizeB, working_space)
		local(N, i, j, twiddle_amount, twiddle_r, twiddle_i, Nbits, shiftleftbits, shiftrightbits, bitmask, index1, source_index, target_index, tmp_r, tmp_i)
		(
	N = sizeA*sizeB;

	// Permute
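	// Inverse of the reordering done at the end of fft_ab(): scatter the
	// natural-order bins back into the (k1 + k2*sizeA) layout that the staged
	// inverse transforms below expect. Rotating by log2(sizeB) instead of
	// log2(sizeA) gives the opposite rotation, i.e. the inverse permutation.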
	Nbits = 1;
	while ((1<<Nbits) < N) (
		Nbits += 1;
	);
	shiftleftbits = 1; // This is the shift required to find the source index for a given target
	bitmask = N - 1;
	while ((1<<shiftleftbits) < sizeB) ( // NOTE: the FFT uses sizeA, the IFFT uses sizeB (inverse shift)
		shiftleftbits += 1;
	);
	shiftrightbits = Nbits - shiftleftbits;
	
	index1 = 1; // Source index
	while (index1 < N) (
		target_index = index1;
		while (
			target_index = ((target_index<<shiftleftbits)&bitmask) + (target_index>>shiftrightbits);
			target_index > index1;
		);
		target_index == index1 ? ( // This index is the shortest in its permutation group
			tmp_r = block[target_index*2];
			tmp_i = block[target_index*2 + 1];
			source_index = ((target_index<<shiftleftbits)&bitmask) + (target_index>>shiftrightbits);
			while (source_index != index1) (
				block[target_index*2] = block[source_index*2];
				block[target_index*2 + 1] = block[source_index*2 + 1];
				target_index = source_index;
				source_index = ((target_index<<shiftleftbits)&bitmask) + (target_index>>shiftrightbits);
			);
			block[target_index*2] = tmp_r;
			block[target_index*2 + 1] = tmp_i;
		);
		index1 += 1;
	);

	// Perform the in-place IFFTs
	j = 0;
	while (j < sizeB) (
		fft_ipermute(block + j*sizeA*2, sizeA);
		ifft(block + j*sizeA*2, sizeA);
		j += 1;
	);

	i = 0;
	// Perform the stepwise IFFTs
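	// Mirror of fft_ab()'s first stage, run in reverse: undo the twiddle factors
	// (note the positive angle) before inverse-transforming the stride-sizeA samples.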
	while (i < sizeA) (
		j = 0;
		// Copy to working area, with anti-twiddle factors
		while (j < sizeB) (
			twiddle_amount = i*j/N;
			twiddle_r = cos(twiddle_amount*2*$pi);
			twiddle_i = sin(twiddle_amount*2*$pi);
			
			working_space[j*2] = block[(i + j*sizeA)*2]*twiddle_r - block[(i + j*sizeA)*2 + 1]*twiddle_i;
			working_space[j*2 + 1] = block[(i + j*sizeA)*2]*twiddle_i + block[(i + j*sizeA)*2 + 1]*twiddle_r;
			j += 1;
		);
		fft_ipermute(working_space, sizeB);
		ifft(working_space, sizeB);
		j = 0;
		while (j < sizeB) (
			block[(i + j*sizeA)*2] = working_space[j*2];
			block[(i + j*sizeA)*2 + 1] = working_space[j*2 + 1];
			j += 1;
		);

		i += 1;
	); 
);

function ifft_big(block, N, working_space) local(sizeB) (
	N > 32768 ? (
		sizeB = 1<<7;
		(N/sizeB) > 32768 ? sizeB = 1<<11;
		(N/sizeB) > 32768 ? sizeB = 1<<13;
		ifft_ab(block, N/sizeB, sizeB, working_space);
	) : (
		fft_ipermute(block, N);
		ifft(block, N);
	);
);
The fft_ab() and ifft_ab() functions let you choose the split yourself, but in my speed tests it made little difference (about 3%), and (x, 128) was a good choice. You shouldn't normally need the *_ab() functions - just use *_big().
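
If you do want to pick the split yourself, here's a sketch (reusing the hypothetical buffer/working_mem offsets from the earlier example) of an explicit 512 x 128 split of a 65536-point transform. working_space has to hold 2*sizeB slots, and the inverse call must use the same split so the permutations match up.

Code:
fft_ab(buffer, 65536/128, 128, working_mem);   // sizeA = 512, sizeB = 128
// ... process the spectrum here ...
ifft_ab(buffer, 65536/128, 128, working_mem);  // same split as the forward transform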