S
Sebastian
I am trying to implement the Baeza-Yates intersection algorithm, but
need some help.
This algorithm is among the fastest known algorithms for intersecting
sorted sets. For small m, Baeza-Yates is O(m * log(n)), while for n =
alpha * m it is O(n), i. e. it performs particularly well when the
one set is sufficiently larger than the other.
E. g., see "Experimental Analysis of a Fast Intersection Algorithm for
Sorted Sequences" by Ricardo Baeza-Yates and Alejandro Salinger
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.91.7899&rep=rep1&type=pdf
There is a C implementation by Erik Frey, see
http://fawx.com/
http://github.com/erikfrey/themas/tree/master/src/set_intersection/
Unfortunately, I cannot read C very well. Below is my attempt to come
up with a Java version, which does not work (it generates stack
overflows, and my attempts to repair that led to other incorrect
behavior. Also, lowerBound should probably use a binary search).
Perhaps it would be worthwhile to put together a working version
and post it somewhere for everyone's benefit.
-- Sebastian
import java.util.ArrayList;
import java.util.List;
import java.util.Comparator;
public class Intersection
/**
* Finds the intersection of two sorted random access lists.
* @param <T> type of the list elements
* @param list1 a random access list of T sorted by cmp
* @param list2 a random access list of T sorted by cmp
* @param cmp a Comparator for T's
* @return a random access list of T sorted by cmp containing the
intersection of list1 and list2
*/
public static <T> List<T> intersect( List<T> list1, List<T> list2,
Comparator<T> cmp )
{
int size1 = list1.size();
int size2 = list2.size();
int maxSize = Math.max( size1, size2 );
return baeza_intersect( list1, 0, (size1 - 1), list2, 0, (size2 -
1), cmp, new ArrayList<T>( maxSize ) );
}
private static <T> List<T> baeza_intersect( List<T> list1, int
begin1, int end1, List<T> list2, int begin2, int end2,
Comparator<T> cmp,
List<T> out )
{
if( (end1 - begin1) <= (end2 - begin2) ) {
if( begin1 > end1 ) {
return out;
}
// binary probe (could use interpolation probe if there were a
// distance measure on T
int probe1 = begin1 + ((end1 - begin1) / 2);
int probe2 = lowerBound( list2, begin2, end2, list1.get( probe1
), cmp );
if( probe1 < end1 || probe1 < end2 ) {
baeza_intersect( list1, begin1, probe1, list2, begin2, probe2,
cmp, out );
}
if( (probe2 != end2) &&(cmp.compare( list1.get( probe1 ),
list2.get( probe2 ) ) == 0) ) {
out.add( list2.get( probe2 ) );
}
probe1++;
probe2++;
return baeza_intersect( list1, probe1, end1, list2, probe2, end2,
cmp, out );
}
else {
return baeza_intersect( list2, begin2, end2, list1, begin1, end1,
cmp, out );
}
}
private static <T> int lowerBound( List<T> list, int begin, int end,
T value, Comparator<T> cmp )
{
// should use binary search?
int i = begin;
while( (i < end) && (cmp.compare( list.get( i ), value ) < 0) ) {
i++;
}
return i;
}
}
need some help.
This algorithm is among the fastest known algorithms for intersecting
sorted sets. For small m, Baeza-Yates is O(m * log(n)), while for n =
alpha * m it is O(n), i. e. it performs particularly well when the
one set is sufficiently larger than the other.
E. g., see "Experimental Analysis of a Fast Intersection Algorithm for
Sorted Sequences" by Ricardo Baeza-Yates and Alejandro Salinger
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.91.7899&rep=rep1&type=pdf
There is a C implementation by Erik Frey, see
http://fawx.com/
http://github.com/erikfrey/themas/tree/master/src/set_intersection/
Unfortunately, I cannot read C very well. Below is my attempt to come
up with a Java version, which does not work (it generates stack
overflows, and my attempts to repair that led to other incorrect
behavior. Also, lowerBound should probably use a binary search).
Perhaps it would be worthwhile to put together a working version
and post it somewhere for everyone's benefit.
-- Sebastian
import java.util.ArrayList;
import java.util.List;
import java.util.Comparator;
public class Intersection
/**
* Finds the intersection of two sorted random access lists.
* @param <T> type of the list elements
* @param list1 a random access list of T sorted by cmp
* @param list2 a random access list of T sorted by cmp
* @param cmp a Comparator for T's
* @return a random access list of T sorted by cmp containing the
intersection of list1 and list2
*/
public static <T> List<T> intersect( List<T> list1, List<T> list2,
Comparator<T> cmp )
{
int size1 = list1.size();
int size2 = list2.size();
int maxSize = Math.max( size1, size2 );
return baeza_intersect( list1, 0, (size1 - 1), list2, 0, (size2 -
1), cmp, new ArrayList<T>( maxSize ) );
}
private static <T> List<T> baeza_intersect( List<T> list1, int
begin1, int end1, List<T> list2, int begin2, int end2,
Comparator<T> cmp,
List<T> out )
{
if( (end1 - begin1) <= (end2 - begin2) ) {
if( begin1 > end1 ) {
return out;
}
// binary probe (could use interpolation probe if there were a
// distance measure on T
int probe1 = begin1 + ((end1 - begin1) / 2);
int probe2 = lowerBound( list2, begin2, end2, list1.get( probe1
), cmp );
if( probe1 < end1 || probe1 < end2 ) {
baeza_intersect( list1, begin1, probe1, list2, begin2, probe2,
cmp, out );
}
if( (probe2 != end2) &&(cmp.compare( list1.get( probe1 ),
list2.get( probe2 ) ) == 0) ) {
out.add( list2.get( probe2 ) );
}
probe1++;
probe2++;
return baeza_intersect( list1, probe1, end1, list2, probe2, end2,
cmp, out );
}
else {
return baeza_intersect( list2, begin2, end2, list1, begin1, end1,
cmp, out );
}
}
private static <T> int lowerBound( List<T> list, int begin, int end,
T value, Comparator<T> cmp )
{
// should use binary search?
int i = begin;
while( (i < end) && (cmp.compare( list.get( i ), value ) < 0) ) {
i++;
}
return i;
}
}