This file was created with JabRef 2.2.
Encoding: UTF8
@ARTICLE{Apostolico2005,
  author  = {Alberto Apostolico and Raffaele Giancarlo},
  title   = {Periodicity and Repetitions in Parameterized Strings},
  journal = {Electronic Notes in Discrete Mathematics},
  year    = {2005},
  volume  = {21},
  pages   = {227--230},
  note    = {General Theory of Information Transfer and Combinatorics},
  pdf     = {D:\Publicaciones\Artículos\Apostolico. Giancarlo. Periodicity and Repetitions in Parameterized Strings.pdf},
  url     = {http://www.sciencedirect.com/science/article/B75GV-4GST147-13/2/46fc6d000a32de7151a1f5411c013601}
}
@ARTICLE{Hyyro2005,
  author   = {Heikki Hyyr{\"o}},
  title    = {Bit-parallel approximate string matching algorithms with transposition},
  journal  = {Journal of Discrete Algorithms},
  year     = {2005},
  volume   = {3},
  number   = {2-4},
  pages    = {215--229},
  month    = jun,
  abstract = {Using bit-parallelism has resulted in fast and practical algorithms
    for approximate string matching under Levenshtein edit distance, which
    permits a single edit operation to insert, delete or substitute a
    character. Depending on the parameters of the search, currently the
    fastest non-filtering algorithms in practice are the O(k[left
    ceiling]m/w[right ceiling]n) algorithm of Wu and Manber, the O([left
    ceiling](k+2)(m-k)/w[right ceiling]n) algorithm of Baeza-Yates and
    Navarro, and the O([left ceiling]m/w[right ceiling]n) algorithm of
    Myers, where m is the pattern length, n is the text length, k is the
    error threshold and w is the computer word size. In this paper we
    discuss a uniform way of modifying each of these algorithms to permit
    also a fourth type of edit operation: transposing two adjacent
    characters in the pattern. This type of edit distance is also known as
    Damerau edit distance. In the end we also present an experimental
    comparison of the resulting algorithms.},
  note     = {Combinatorial Pattern Matching (CPM) Special Issue},
  doi      = {10.1016/j.jda.2004.08.006},
  keywords = {Levenshtein edit distance; Damerau edit distance; Approximate string
    matching; Bit-parallelism},
  pdf      = {D:\Publicaciones\Artículos\Hyyro H. Bit-parallel Aproximate string matching algorithms with transposition.pdf},
  url      = {http://www.sciencedirect.com/science/article/B758J-4DBCF4Y-1/2/d2a34c9cb978a70d1a174b18faf3c6c1}
}
@INPROCEEDINGS{Adebiyi2003,
  author    = {Adebiyi, E. F. and Dipe, T.},
  title     = {Finding higher order motifs under the {Levenshtein} measure},
  booktitle = {Proceedings of the 2003 IEEE Bioinformatics Conference. CSB 2003},
  year      = {2003},
  pages     = {583--584},
  month     = aug,
  abstract  = {We study the problem of finding higher order motifs under the
    Levenshtein measure, otherwise known as the edit distance. In the
    problem set-up, we are given N sequences, each of average length n,
    over a finite alphabet /spl Sigma/ and thresholds D and q, we are to
    find composite motifs that contain motifs of length P (these motifs
    occur with almost D differences) in 1 /spl les/ q /spl les/ N distinct
    sequences. Two interesting but involved algorithms for finding higher
    order motifs under the edit distance was presented by Marsan and
    Sagot. Their second algorithm is much more complicated and its
    complexity is asymptotically not better. Their first algorithm runs in
    O(M /spl middot/ N/sup 2/n/sup 1+/spl alpha/ /spl middot/p /spl
    middot/ pow(/spl epsi/)/) where p /spl ges/ 2, /spl alpha/ > 0,
    pow(/spl epsi/) is a concave function that is less than 1, /spl epsi/=
    D/P and M is the expected number of all monad motifs. We present an
    alternative algorithmic approach also for Edit distance based on the
    concept described. The resulting algorithm is simpler and runs in
    O(N/sup 2/n/sup 1+p /spl middot/ pow(/spl epsi/)/) expected time.},
  doi       = {10.1109/CSB.2003.1227414},
  pdf       = {D:\Publicaciones\Inproceedings\adebiyi.pdf}
}
@ARTICLE{Aho1972,
  author    = {Aho, A. V. and Peterson, T. G.},
  title     = {A minimum distance error-correcting parser for context-free languages},
  journal   = {SIAM Journal on Computing},
  year      = {1972},
  volume    = {1},
  number    = {4},
  pages     = {305--312},
  abstract  = {We assume three types of syntax errors can debase the sentences of
    a language generated by a context-free grammar: the replacement of a
    symbol by an incorrect symbol, the insertion of an extraneous symbol,
    or the deletion of a symbol. We present an algorithm that will parse
    any input string to completion finding the fewest possible number of
    errors. On a random access computer the algorithm requires time
    proportional to the cube of the length of the input.},
  doi       = {10.1137/0201022},
  keywords  = {Syntax error; error correction; parsing; context-free grammar; computational
    complexity},
  publisher = {SIAM}
}
@ARTICLE{Amir2004,
  author   = {Amihood Amir and Ayelet Butman and Maxime Crochemore and Gad M. Landau
    and Mary Schaps},
  title    = {Two-dimensional pattern matching with rotations},
  journal  = {Theoretical Computer Science},
  year     = {2004},
  volume   = {314},
  number   = {1-2},
  pages    = {173--187},
  month    = feb,
  abstract = {The problem of pattern matching with rotation is that of finding
    all occurrences of a two-dimensional pattern in a text, in all
    possible rotations. We prove an upper and lower bound on the number of
    such different possible rotated patterns. Subsequently, given an m x m
    array (pattern) and an n x n array (text) over some finite alphabet
    [Sigma], we present a new method yielding an O(n2m3) time algorithm
    for this problem.},
  doi      = {10.1016/j.tcs.2003.10.039},
  keywords = {Design and analysis of algorithms; Two-dimensional pattern matching;
    Rotation},
  pdf      = {D:\Publicaciones\Artículos\Amir et al. Two-dimensional pattern matching with rotations.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6V1G-4BCWWT1-3/2/77c251d04039632956b020d952bcd1ee}
}
@ARTICLE{Apostolico2007a,
  author   = {Alberto Apostolico and Peter L. Erd{\H{o}}s and Moshe Lewenstein},
  title    = {Parameterized matching with mismatches},
  journal  = {Journal of Discrete Algorithms},
  year     = {2007},
  volume   = {5},
  number   = {1},
  pages    = {135--140},
  month    = mar,
  abstract = {The problem of approximate parameterized string searching consists
    of finding, for a given text t=t1t2...tn and pattern p=p1p2...pm over
    respective alphabets [Sigma]t and [Sigma]p, the injection [pi]i from
    [Sigma]p to [Sigma]t maximizing the number of matches between [pi]i(p)
    and titi+1...ti+m-1 (i=1,2,...,n-m+1). We examine the special case
    where both strings are run-length encoded, and further restrict to
    the case where one of the alphabets is binary. For this case, we give
    a construction working in time O(n+(rpxrt)[alpha](rt)log(rt)), where
    rp and rt denote the number of runs in the corresponding encodings for
    y and x, respectively, and [alpha] is the inverse of the Ackermann's
    function.},
  doi      = {10.1016/j.jda.2006.03.014},
  keywords = {Algorithms; String matching; Pattern matching; Parameterized matching;
    Pattern matching with mismatches},
  pdf      = {D:\Publicaciones\Artículos\Apostolico et al. Parameterizad matching with mismatches.pdf},
  url      = {http://www.sciencedirect.com/science/article/B758J-4JW128C-6/2/ad01648d986bf4e2f252b1fa8f5116c3}
}
@BOOK{Apostolico1997,
  author    = {Alberto Apostolico and Zvi Galil},
  title     = {Pattern matching algorithms},
  publisher = {Oxford University Press},
  year      = {1997},
  abstract  = {Issues of matching and searching on elementary discrete structures
    arise pervasively in computer science and many of its applications,
    and their relevance is expected to grow as information is amassed and
    shared at an accelerating pace. Several algorithms were discovered as
    a result of these needs, which in turn created the subfield of Pattern
    Matching. This book provides an overview of the current state of
    Pattern Matching as seen by specialists who have devoted years of
    study to the field. It covers most of the basic principles and
    presents material advanced enough to faithfully portray the current
    frontier of research. Because of these recent advances, this is the
    right time for a book that brings together information relevant to
    both graduate students and specialists in need of an in-depth
    reference.},
  url       = {http://books.google.com.co/books?hl=es&lr=&id=mFd_grFyiT4C&oi=fnd&pg=RA2-PA2&sig=b_iozTtafRNpZgKced40Na08sJI&dq=Parallel+computations+of+Levenshtein+distances}
}
@ARTICLE{Apostolico2007,
  author   = {Alberto Apostolico and Cinzia Pizzi},
  title    = {Motif discovery by monotone scores},
  journal  = {Discrete Applied Mathematics},
  year     = {2007},
  volume   = {155},
  number   = {6-7},
  pages    = {695--706},
  month    = apr,
  abstract = {The detection of frequent patterns such as motifs and higher aggregates
    is of paramount interest in biology and invests many other
    applications of automated discovery. The problem with its variants is
    usually plagued with computational burden. A related difficulty is
    posed by the fact, that due to the sheer mole of candidates, the
    tables and indices at the outset tend to be bulky, un-manageable, and
    ultimately uninformative. For solid patterns, it is possible to
    compact the size of statistical indices by resort to certain
    monotonicities exhibited by popular scores. The savings come from the
    fact that these monotonicities enable one to partition the candidate
    over-represented words into families in such a way that it suffices to
    consider and weigh only one candidate per family. In this paper, we
    study the problem of extracting, from given source x and error
    threshold k, substrings of x that occur unusually often in x within k
    substitutions or mismatches. Specifically, we assume that the input
    textstring x of n characters is produced by an i.i.d. source, and
    design efficient methods for computing the probability and expected
    number of occurrences for substrings of x with (either exactly or up
    to) k mismatches. Two related schemes are presented. In the first one,
    an O(nk) time preprocessing of x is developed that supports the
    following subsequent query: for any substring w of x arbitrarily
    specified as input, the probability of occurrence of w in x within
    (either exactly or up to) k mismatches is reported in O(k2) time. In
    the second scheme, a length or length range is arbitrarily specified,
    and the above probabilities are computed for all substrings of x
    having length in that range, in overall O(nk) time. Further,
    monotonicity conditions are introduced and studied for the probability
    and expected frequency of a substring under extension, increased
    number of errors, or both. Over intervals of constant frequency count,
    these monotonicities translate to some of the scores in use, thereby
    reducing the size of tables at the outset and enhancing the process of
    discovery. These latter derivations extend to patterns with mismatches
    an analysis previously devoted to exact patterns.},
  comment  = {Computational Molecular Biology Series, Issue V},
  doi      = {10.1016/j.dam.2005.09.017},
  keywords = {Design and analysis of algorithms; Pattern matching; Pattern discovery;
    Motif; Over-represented word; Monotone score; Correction factor},
  pdf      = {D:\Publicaciones\Artículos\Apostolico. Pizzi. Motif discovery by monotone score.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6TYW-4M3RNX6-1/2/91ab96ab1ba5d4d1d5a548e9aa5b8f35}
}
@INPROCEEDINGS{Arslan2006,
  author    = {Arslan, A. N.},
  title     = {An algorithm for string edit distance allowing substring reversals},
  booktitle = {Sixth IEEE Symposium on BioInformatics and BioEngineering. BIBE 2006},
  year      = {2006},
  pages     = {220--226},
  address   = {Arlington, VA},
  abstract  = {The edit distance between given two strings X and Y is the minimum
    number of edit operations that transform X into Y. Ordinarily, string
    editing is based on character insert, delete, and substitute
    operations. It has been suggested that extending this model with block
    (substring) edits would be useful in applications such as DNA sequence
    comparison. In its general form, the resulting problem is NP-hard.
    However, there are efficient algorithms when string edits include only
    character, and block replacements. We introduce a new edit model which
    permits insertions, deletions, and substitutions at character level,
    and also reversals of substrings. We present an algorithm whose
    worst-case time complexity is O(n2m) where n = |X| le m = |Y|, and we
    prove that the average running time of the algorithm is O(nm). Our
    experiments on randomly generated strings verify these results. The
    main contribution of this paper is that we present an algorithm to
    find all possible reversals using a generalized suffix tree, which is
    fast on average.},
  doi       = {10.1109/BIBE.2006.253338},
  pdf       = {D:\Publicaciones\Inproceedings\arslan.pdf}
}
@ARTICLE{Arslan2000,
  author      = {Abdula N. Arslan and Ömer Egecioglu},
  title       = {Efficient Algorithms For Normalized Edit Distance},
  journal     = {Journal of Discrete Algorithms},
  year        = {2000},
  volume      = {1},
  number      = {1},
  pages       = {3--20},
  abstract    = {A common model for computing the similarity of two strings X and Y
    of lengths m and n respectively, with m $\le$ n, is to transform X
    into Y through a sequence of edit operations, called an edit sequence.
    The edit operations are of three types: insertion, deletion, and
    substitution. A given cost function assigns a weight to each edit
    operation. The amortized weight for a given edit sequence is the ratio
    of its weight to its length, and the minimum of this ratio over all
    edit sequences is the normalized edit distance. Existing algorithms
    for normalized edit distance computation with proven complexity bounds
    require O(mn) time in the worst-case. We give provably better
    algorithms: an O(mn log n)-time algorithm when the cost function is
    uniform, i.e, the weights of edit operations depend only on the type
    but not on the individual symbols involved, and an O(mn log m)-time
    algorithm when the weights are rational.},
  citeseerurl = {http://citeseer.ist.psu.edu/arslan00efficient.html},
  keywords    = {edit distance, normalized edit distance, algorithm, dynamic programming,
    fractional programming, ratio minimization.},
  pdf         = {D:\Publicaciones\Artículos\Arslan. Egecioglu. Efficient Algorithms For Normalized Edit Distance.pdf},
  url         = {http://citeseer.ist.psu.edu/arslan00efficient.html}
}
@INPROCEEDINGS{Atallah11992,
  author    = {Mikhail J. Atallah and Philippe Jacquet and Wojciech Szpankowski},
  title     = {Pattern matching with mismatches: A probabilistic analysis and a
    randomized algorithm},
  booktitle = {Combinatorial Pattern Matching},
  series    = {Lecture Notes in Computer Science},
  volume    = {644},
  year      = {1992},
  publisher = {Springer},
  abstract  = {Given a text of length n and a pattern of length m over some (possibly
    unbounded) alphabet, we consider the problem of finding all positions
    in the text at which the pattern ``almost occurs''. Here by ``almost
    occurs'' we mean that at least some fixed fraction $\rho$ of the
    characters of the pattern (for example, at least 60\% of them) are
    equal to their corresponding characters in the text. We design a
    randomized algorithm that has O(n log m) worst-case time complexity
    and computes with high probability all of the almost-occurrences of
    the pattern in the text. This algorithm assumes that the fraction
    $\rho$ is given as part of its input, and it works well even for
    relatively small values of $\rho$. It makes no assumptions about the
    probabilistic characteristics of the input. Our second contribution
    deals with the issue of which values of $\rho$ correspond to the
    intuitive notion of similarity between pattern and text, and this
    leads us to the development of a probabilistic analysis for the case
    where both input strings are random (in the usual, i.e., Bernoulli,
    model).
    The first author's research was supported by the Office of Naval
    Research under Grants N0014-84-K-0502 and N0014-36-K-0689, and in part
    by AFOSR Grant 90-0107, and the NSF under Grant DCR-8451393, and in
    part by Grant R01 LM05118 from the National Library of Medicine. The
    second author was supported by NATO Collaborative Grant 0057/89. The
    third author's research was supported by AFOSR Grant 90-0107 and NATO
    Collaborative Grant 0057/89, and, in part by the NSF Grant
    CCR-8900305, and by Grant R01 LM05118 from the National Library of
    Medicine},
  doi       = {10.1007/3-540-56024-6}
}
@ARTICLE{Baeza-Yates1994,
  author   = {Ricardo A. Baeza-Yates and Christian Choffrut and Gaston H. Gonnet},
  title    = {On Boyer-Moore automata},
  journal  = {Algorithmica},
  year     = {1994},
  volume   = {12},
  pages    = {268--292},
  abstract = {The notion of Boyer-Moore automaton was introduced by Knuth, Morris
    and Pratt in their historical paper on fast pattern matching. It leads
    to an algorithm that requires more pre-processing but is more
    efficient than the original Boyer-Moore's algorithm.},
  pdf      = {D:\Publicaciones\Artículos\Baeza-Yates et al. On Boyer-Moore automata.pdf},
  url      = {http://www.dcc.uchile.cl/~rbaeza/cv/journals.html}
}
@ARTICLE{Baeza-Yates1996,
  author   = {Ricardo A. Baeza-Yates and Chris H. Perleberg},
  title    = {Fast and practical approximate string matching},
  journal  = {Information Processing Letters},
  year     = {1996},
  volume   = {59},
  number   = {1},
  pages    = {21--27},
  month    = jul,
  abstract = {We present new algorithms for approximate string matching based in
    simple, but efficient, ideas. First, we present an algorithm for
    string matching with mismatches based in arithmetical operations that
    runs in linear worst case time for most practical cases. This is a new
    approach to string searching. Second, we present an algorithm for
    string matching with errors based on partitioning the pattern that
    requires linear expected time for typical inputs.},
  doi      = {10.1016/0020-0190(96)00083-X},
  keywords = {Algorithms; Approximate string matching},
  pdf      = {D:\Publicaciones\Artículos\Baeza-Yates. Perleberg. Fast And practical approximate string matching.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6V0F-3VVCJTY-5/2/1a81a71d32b703f975a0198be573494e}
}
@ARTICLE{Baeza-Yates1995,
  author   = {Ricardo Baeza-Yates and Rene Schott},
  title    = {Parallel searching in the plane},
  journal  = {Computational Geometry},
  year     = {1995},
  volume   = {5},
  number   = {3},
  pages    = {143--154},
  abstract = {In this paper we investigate parallel searching in the plane using
    robots as searchers. We show that a huge number of robots are not
    necessary for several problems and under different assumptions. This
    corresponds to real situations since, actually, the number of
    processors of parallel machines is fixed and independent of the
    dimension of the problem to be solved.},
  doi      = {10.1016/0925-7721(95)00003-R},
  pdf      = {D:\Publicaciones\Artículos\Baeza-Yates. Schott. Parallel searching in the plane.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6TYS-3YVD08Y-5/2/74e38c5aa555040a375a9910cf70f6b4}
}
@ARTICLE{Baker1996,
  author   = {Brenda S. Baker},
  title    = {Parameterized Pattern Matching: Algorithms and Applications},
  journal  = {Journal of Computer and System Sciences},
  year     = {1996},
  volume   = {52},
  number   = {1},
  pages    = {28--42},
  abstract = {The problem of finding sections of code that either are identical
    or are related by the systematic renaming of variables or constants
    can be modeled in terms of parameterized strings (p-strings) and
    parameterized matches (p-matches). P-strings are strings over two
    alphabets, one of which represents parameters. Two p-strings are a
    parameterized match (p-match) if one p-string is obtained by renaming
    the parameters of the other by a one-to-one function. In this paper,
    we investigate parameterized pattern matching via parameterized suffix
    trees (p-suffix trees). We give two algorithms for constructing
    p-suffix trees: one (eager) that runs in linear time for fixed
    alphabets, and another that uses auxiliary data structures and runs in
    O(n log(n)) time for variable alphabets, where n is input length. We
    show that using a p-suffix tree for a pattern p-string P, it is
    possible to search for all p-matches of P within a text p-string T in
    space linear in P and time linear in T for fixed alphabets, or
    O(T log(min(P, [sigma])) time and O(P) space for variable alphabets,
    where [sigma] is the sum of the alphabet sizes. The simpler p-suffix
    tree construction algorithm eager has been implemented, and
    experiments show it to be practical. Since it runs faster than
    predicted by the above worst-case bound, we reanalyze the algorithm
    and show that eager runs in time O(min(t S+m(t, S) | t>0) log
    [sigma])), where for an input p-string S, m(t, S) is the number of
    maximal p-matches of length at least t that occur within S, and
    [sigma] is the sum of the alphabet sizes. Experiments with the
    author's program dup (B. Baker, in ``Comput. Sci. Statist.,'' Vol. 24,
    1992) for finding all maximal p-matches within a p-string have found
    m(t, S) to be less than S in practice unless t is small.},
  doi      = {10.1006/jcss.1996.0003},
  pdf      = {D:\Publicaciones\Artículos\baker.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6WJ0-45NJMKJ-J/2/0b04b8bd4e79a7b2284d4a52c451e50d}
}
@INPROCEEDINGS{Bar-Yossef2004,
  author       = {Bar-Yossef, Ziv and Jayram, T. S. and Krauthgamer, Robert and Kumar, Ravi},
  title        = {Approximating edit distance efficiently},
  booktitle    = {Proceedings 45th Annual IEEE Symposium on Foundations of Computer
    Science, 2004},
  year         = {2004},
  pages        = {550--559},
  address      = {Haifa, Israel},
  month        = oct,
  organization = {Dept. of Electr. Eng., Technion},
  doi          = {10.1109/FOCS.2004.14},
  pdf          = {D:\Publicaciones\Inproceedings\Bar-Yossef et al. Approximating Edit Distance Efficiently.pdf}
}
@TECHREPORT{Boyar1994,
  author      = {Joan Boyar and Kim S. Larsen},
  title       = {Faster parallel computation of edit distances},
  institution = {Dept. of Mathematics and Computer Science, Odense University, Denmark},
  year        = {1994},
  abstract    = {Fast parallel algorithms are given for the longest common subsequence
    problem and the string editing problem with bounded weights. In the
    COMMON PRAM model, the algorithm for the longest common subsequence
    problem takes time O(log m), where m is the length of the shorter
    string, while the algorithm for the string editing problem with
    bounded weights takes time O(max(log m, log n/ log log n)), where m is
    the length of the shorter string and n is the length of the longer
    string.},
  citeseerurl = {http://citeseer.ist.psu.edu/boyar94faster.html},
  pdf         = {D:\Publicaciones\TechReport\Boyard J. Larsen K. Faster parallel computation of edit distances.pdf},
  url         = {http://citeseer.ist.psu.edu/boyar94faster.html}
}
@ARTICLE{Bunke1995,
  author    = {H. Bunke and J. Csirik},
  title     = {An improved algorithm for computing the edit distance of run-length
    coded strings},
  journal   = {Information Processing Letters},
  year      = {1995},
  volume    = {54},
  number    = {2},
  pages     = {93--96},
  address   = {Amsterdam, The Netherlands},
  doi       = {10.1016/0020-0190(95)00005-W},
  issn      = {0020-0190},
  keywords  = {Algorithms; Approximate string matching; String edit distance},
  pdf       = {D:\Publicaciones\Artículos\Bunke H. Csirik J. An Improved algorithm for computing the edit distance of run-length coded strings.pdf},
  publisher = {Elsevier North-Holland, Inc.}
}
@ARTICLE{Bunke1995a,
  author   = {Bunke, H. and Csirik, J.},
  title    = {Parametric string edit distance and its application to pattern recognition},
  journal  = {IEEE Transactions on Systems, Man and Cybernetics},
  year     = {1995},
  volume   = {25},
  number   = {1},
  pages    = {202--206},
  month    = jan,
  abstract = {A generalized version of the string matching algorithm by Wagner and
    Fischer (1974) is proposed. It is based on a parametrization of the
    edit cost. We assume constant cost for any delete and insert
    operation, but the cost for replacing a symbol is given as a parameter
    τ. For any two strings A and B, our algorithm computes their edit
    distance in terms of the parameter τ. We give the new algorithm, study
    some of its properties, and discuss potential applications to pattern
    recognition},
  doi      = {10.1109/21.362950},
  pdf      = {D:\Publicaciones\Artículos\Bunke. c. Paramtric.pdf}
}
@MASTERSTHESIS{Bustos2003,
  author   = {Javier Alejandro Bustos},
  title    = {Estudio Comparativo de algoritmos indexados para búsqueda aproximada
    de texto},
  school   = {Universidad de Chile},
  year     = {2003},
  abstract = {El problema de la búsqueda aproximada en texto aparece en una gran
    cantidad de áreas de las Ciencias de la Computación; se aplica, entre
    otros, a recuperación de texto y biología computacional.},
  pdf      = {D:\Publicaciones\MasterThesis\Bustos J. A. ESTUDIO COMPARATIVO DE ALGORITMOS INDEXADOS.pdf},
  url      = {http://www.uchile.cl/}
}
@ARTICLE{Chavez2003,
  author   = {Edgar Ch\'avez and Gonzalo Navarro},
  title    = {Probabilistic proximity search: Fighting the curse of dimensionality
    in metric spaces},
  journal  = {Information Processing Letters},
  year     = {2003},
  volume   = {85},
  number   = {1},
  pages    = {39--46},
  month    = jan,
  abstract = {Proximity searches become very difficult on "high dimensional" metric
    spaces, that is, those whose histogram of distances has a large mean
    and/or a small variance. This so-called "curse of dimensionality",
    well known in vector spaces, is also observed in metric spaces. The
    search complexity grows sharply with the dimension and with the search
    radius. We present a general probabilistic framework applicable to any
    search algorithm and whose net effect is to reduce the search radius.
    The higher the dimension, the more effective the technique. We
    illustrate empirically its practical performance on a particular class
    of algorithms, where large improvements in the search time are
    obtained at the cost of a very small error probability.},
  doi      = {10.1016/S0020-0190(02)00344-7},
  keywords = {Metric spaces; Proximity searching; Nearest neighbor searching; Distance
    based indexing; Probabilistic algorithms; Approximation algorithms},
  pdf      = {D:\Publicaciones\Artículos\Chávez. Navarro Probabilistic proximity search.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6V0F-475BCSD-4/2/4b9e5165db1137830dc22bfb2dbe10c3}
}
@INPROCEEDINGS{Chang1992,
  author    = {William I. Chang and Jordan Lampe},
  title     = {Theoretical and empirical comparisons of approximate string matching
    algorithms},
  booktitle = {Combinatorial Pattern Matching},
  series    = {Lecture Notes in Computer Science},
  volume    = {644},
  year      = {1992},
  pages     = {175--184},
  publisher = {Springer},
  abstract  = {We study in depth a model of non-exact pattern matching based on edit
    distance, which is the minimum number of substitutions, insertions,
    and deletions needed to transform one string of symbols to another.
    More precisely, the k differences approximate string matching problem
    specifies a text string of length n, a pattern string of length m, the
    number k of differences (substitutions, insertions, deletions) allowed
    in a match, and asks for all locations in the text where a match
    occurs. We have carefully implemented and analyzed various O(kn)
    algorithms based on dynamic programming (DP), paying particular
    attention to dependence on b the alphabet size. An empirical
    observation on the average values of the DP tabulation makes apparent
    each algorithm's dependence on b. A new algorithm is presented that
    computes much fewer entries of the DP table. In practice, its speedup
    over the previous fastest algorithm is 2.5X for binary alphabet; 4X
    for four-letter alphabet; 10X for twenty-letter alphabet. We give a
    probabilistic analysis of the DP table in order to prove that the
    expected running time of our algorithm (as well as an earlier
    ``cut-off'' algorithm due to Ukkonen) is O(kn) for random text.
    Furthermore, we give a heuristic argument that our algorithm is
    $O(kn/(\sqrt{b}-1))$ on the average, when alphabet size is taken into
    consideration.
    This research was conducted at the University of California, Berkeley,
    and was supported in part by Department of Energy grant DE-FG03-90ER60999},
  doi       = {10.1007/3-540-56024-6_14}
}
@ARTICLE{Chrobak2005,
  author    = {Marek Chrobak and Petr Kolman and Ji{\v{r}}{\'\i} Sgall},
  title     = {The greedy algorithm for the minimum common string partition problem},
  journal   = {ACM Transactions on Algorithms},
  year      = {2005},
  volume    = {1},
  number    = {2},
  pages     = {350--366},
  abstract  = {In the Minimum Common String Partition problem (MCSP), we are given
    two strings on input, and we wish to partition them into the same
    collection of substrings, minimizing the number of the substrings in
    the partition. This problem is NP-hard, even for a special case,
    denoted 2-MCSP, where each letter occurs at most twice in each input
    string. We study a greedy algorithm for MCSP that at each step
    extracts a longest common substring from the given strings. We show
    that the approximation ratio of this algorithm is between Ω(n0.43) and
    O(n0.69). In the case of 2-MCSP, we show that the approximation ratio
    is equal to 3. For 4-MCSP, we give a lower bound of Ω(log n).},
  address   = {New York, NY, USA},
  doi       = {10.1145/1103963.1103971},
  issn      = {1549-6325},
  publisher = {ACM Press}
}
@ARTICLE{Chavez2006,
  author   = {Edgar Ch{\'a}vez and Gonzalo Navarro},
  title    = {A Metric Index for Approximate String Matching},
  journal  = {Theoretical Computer Science},
  year     = {2006},
  volume   = {352},
  number   = {1-3},
  pages    = {266--279},
  month    = mar,
  abstract = {We present a radically new indexing approach for approximate string
    matching. The scheme uses the metric properties of the edit distance
    and can be applied to any other metric between strings. We build a
    metric space where the sites are the nodes of the suffix tree of the
    text, and the approximate query is seen as a proximity query on that
    metric space. This permits us finding the occ occurrences of a pattern
    of length m, permitting up to r differences, in a text of length n
    over an alphabet of size [sigma], in average time
    O(m1+[epsilon]+occ) for any [epsilon]>0, if and . The index works well
    up to , where it achieves its maximum average search complexity . The
    construction time of the index is and its space is . This is the first
    index achieving average search time polynomial in m and independent of
    n, for . Previous methods achieve this complexity only for . We also
    present a simpler scheme needing O(n) space.},
  doi      = {10.1016/j.tcs.2005.11.037},
  keywords = {Indexed approximate string matching; Metric spaces; Metric indexes;
    Suffix trees; Suffix arrays},
  pdf      = {D:\Publicaciones\Artículos\A Metric Index for Aproximate String Matching.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6V1G-4HXBGSH-1/2/889a42ca051eca9d71743577b1672cc4}
}
@ARTICLE{Crochemore2001,
  author   = {Maxime Crochemore and Costas S. Iliopoulos and Yoan J. Pinz\'on and
    James F. Reid},
  title    = {A fast and practical bit-vector algorithm for the Longest Common
    Subsequence problem},
  journal  = {Information Processing Letters},
  year     = {2001},
  volume   = {80},
  number   = {6},
  pages    = {279--285},
  month    = dec,
  abstract = {This paper presents a new practical bit-vector algorithm for solving
    the well-known Longest Common Subsequence (LCS) problem. Given two
    strings of length m and n, n[greater-or-equal, slanted]m, we present
    an algorithm which determines the length p of an LCS in O(nm/w) time
    and O(m/w) space, where w is the number of bits in a machine word.
    This algorithm can be thought of as column-wise "parallelization" of
    the classical dynamic programming approach. Our algorithm is very
    efficient in practice, where computing the length of an LCS of two
    strings can be done in linear time and constant (additional/working)
    space by assuming that m[less-than-or-equals, slant]w.},
  doi      = {10.1016/S0020-0190(01)00182-X},
  keywords = {Algorithms; Longest Common Subsequence; Dynamic programming; Bit-vector
    algorithm},
  pdf      = {D:\Publicaciones\Artículos\Crochemore et al. A fast and practical bit-vector algorithm for the LCS.pdf},
  url      = {http://www.sciencedirect.com/science/article/B6V0F-4475W6K-2/2/8ff4c255a0756ed7bdfe7fe77fe2f964}
}
@INPROCEEDINGS{Egecioglu1996,
author = {Egecioglu, O. and Ibel, M.},
title = {Parallel algorithms for fast computation of normalized edit distances},
booktitle = {Eighth IEEE Symposium on Parallel and Distributed Processing},
year = {1996},
pages = {496--503},
address = {New Orleans, LA},
month = oct,
abstract = {The authors give work-optimal and polylogarithmic time parallel algorithms
for solving the normalized edit distance problem. The normalized
edit distance between two strings X and Y with lengths n⩾m is
the minimum quotient of the sum of the costs of edit operations transforming
X into Y by the length of the edit path corresponding to those edit
operations. Marzal and Vidal (1993) proposed a sequential algorithm
with a time complexity of O(nm^2). They show that this algorithm can
be parallelized work-optimally on an array of n (or m) processors,
and on a mesh of n×m processors. They then propose a sublinear time
algorithm that is almost work-optimal: using O(mn^1.75) processors,
the time complexity of the algorithm is O(n^0.75 log n) and the total
number of operations is O(mn^2.5 log n). This algorithm runs on
a CREW PRAM, but is likely to work on weaker PRAM models and hypercubes
with minor modifications. Finally, they present a polylogarithmic
O(log^2 n) time algorithm based on matrix multiplication which runs
on a O(n^6/log n) processor hypercube},
doi = {10.1109/SPDP.1996.57037},
pdf = {D:\Publicaciones\Inproceedings\Egecioglu. Ibel. Parallel algorithms.pdf}
}
@ARTICLE{Ehrenfeucht1988,
author = {A. Ehrenfeucht and D. Haussler},
title = {A new distance metric on strings computable in linear time},
journal = {Discrete Appl. Math.},
year = {1988},
volume = {20},
pages = {191--203},
number = {3},
abstract = {We describe a new metric for sequence comparison that emphasizes global
similarity over sequential matching at the local level. It has the
advantage over the Levenshtein metric that strings of lengths n and
m can be compared in time proportional to n + m instead of nm. Various
mathematical properties of the metric are established.},
address = {Amsterdam, The Netherlands, The Netherlands},
doi = {10.1016/0166-218X(88)90076-5},
issn = {0166-218X},
keywords = {sequence comparison; string matching; partial match; distance metric;
clustering; text processing; editing; spelling correction},
publisher = {Elsevier Science Publishers B. V.}
}
@ARTICLE{Ferragina2006,
author = {P. Ferragina and G. Manzini and V. M{\"a}kinen and G. Navarro},
title = {Compressed Representations of Sequences and Full-Text Indexes},
journal = {ACM Transactions on Algorithms (TALG)},
year = {2006},
note = {To appear},
abstract = {Given a sequence S = s(1)s(2) ... s(n) of integers smaller than r
= O(polylog(n)), we show how S can be represented using nH0(S) +
o(n) bits, so that we can know any s(i), as well as answer rank and
select queries on S, in constant time. H0(S) is the zero-order empirical
entropy of S and nH0(S) provides an Information Theoretic lower bound
to the bit storage of any sequence S via a fixed encoding of its
symbols. This extends previous results on binary sequences, and improves
previous results on general sequences where those queries are answered
in O(log r) time. For larger r, we can still represent S in nH0(S)
+ o(n log r) bits and answer queries in O(log r / log log n) time.
Another contribution of this paper is to show how to combine our compressed
representation of integer sequences with an existing compression
boosting technique to design compressed full-text indexes that scale
well with the size of the input alphabet A. Namely, we design a variant
of the FM-index that indexes a string T[1,n] within nHk(T) + o(n)
bits of storage, where Hk(T) is the k-th order empirical entropy
of T. This space bound holds simultaneously for all k <= a log_|A|
n, constant 0< a < 1, and |A| = O(polylog(n)). This index counts
the occurrences of an arbitrary pattern P[1,p] as a substring of
T in O(p) time; it locates each pattern occurrence in O(log^(1+e)
n) time, for any constant 0< e<1; and it reports a text substring
of length l in O(l + log^(1+e) n) time.
Compared to all previous works, our index is the first one that removes
the alphabet-size dependance from all query times, in particular
counting time is linear in the pattern length. Still, our index uses
essentially the same space of the k-th order entropy of the text
T, which is the best space obtained in previous work. We can also
handle larger alphabets of size |A|=O(n^b), for any 0< b<1, by paying
o(n log |A|) extra space and by multiplying all query times by O(log
|A| / log log n).},
url = {http://www.dcc.uchile.cl/~gnavarro/publ.html}
}
@INPROCEEDINGS{Fredriksson2004,
author = {Kimmo Fredriksson},
title = {Metric Indexes for Approximate String Matching in a Dictionary},
booktitle = {Proceedings of the 11th International Symposium on String Processing
and Information Retrieval (SPIRE'2004)},
year = {2004},
volume = {3246},
series = {Lecture Notes in Computer Science},
pages = {212--213},
abstract = {We consider the problem of finding all approximate occurrences of
a given string q, with at most k differences, in a finite database
or dictionary of strings. The strings can be e.g. natural language
words, such as the vocabulary of some document or set of documents.
This has many important application in both off-line (indexed) and
on-line string matching. More precisely, we have a universe ${\mathbb
U}$ of strings, and a non-negative distance function $d: {\mathbb
U} \times {\mathbb U} \rightarrow {\mathbb N}$. The distance function
is metric, if it satisfies (i) $d(x,y) = 0 ~ \Leftrightarrow ~ x
= y$; (ii) $d(x,y) = d(y,x)$; (iii) $d(x,y) \le d(x,z) + d(z,y)$. The
last item is called the ``triangular inequality'', and is the
most important property in our case. Many useful distance functions
are known to be metric, in particular edit (Levenshtein) distance
is metric, which we will use for d.},
doi = {10.1007/b100941},
publisher = {Springer}
}
@INPROCEEDINGS{Gilbert2000,
author = {Gilbert, D. and Schroeder, M.},
title = {FURY: fuzzy unification and resolution based on edit distance},
booktitle = {Proceedings IEEE International Symposium on Bio-Informatics and Biomedical
Engineering},
year = {2000},
month = nov,
pages = {330--336},
address = {Arlington, VA},
abstract = {The authors present a theoretically founded framework for fuzzy unification
and resolution based on edit distance over trees. Their framework
extends classical unification and resolution conservatively. They
prove important properties of the framework and develop the FURY
system, which implements the framework efficiently using dynamic
programming. The authors evaluate the framework and system on a large
problem in the bioinformatics domain, that of detecting typographical
errors in an enzyme name database},
doi = {10.1109/BIBE.2000.889625},
pdf = {D:\Publicaciones\Inproceedings\Gilber.pdf}
}
@INPROCEEDINGS{Hannenhalli1995,
author = {Sridhar Hannenhalli},
title = {Polynomial-time algorithm for computing translocation distance between
genomes},
booktitle = {Proceedings of the 6th Annual Symposium on Combinatorial Pattern
Matching},
year = {1995},
editor = {Z. Galil and E. Ukkonen},
number = {937},
pages = {162--176},
address = {Espoo, Finland},
publisher = {Springer-Verlag, Berlin},
abstract = {With the advent of large-scale DNA physical mapping and sequencing,
studies of genome rearrangements are becoming increasingly important
in evolutionary molecular biology. From a computational perspective,
the study of evolution based on rearrangements leads to a rearrangement
distance problem, i.e., computing the minimum number of rearrangement
events required to transform one genome into another. Different types
of rearrangement events give rise to a spectrum of interesting combinatorial
problems. The complexity of most of these problems is unknown. Multichromosomal
genomes frequently evolve by a rearrangement event called translocation
which exchanges genetic material between different chromosomes. In
this paper we study the translocation distance problem, modeling
the evolution of genomes evolving by translocations. The translocation
distance problem was recently studied for the first time by Kececioglu
and Ravi, who gave a 2-approximation algorithm for computing translocation
distance. In this paper we prove a duality theorem leading to a polynomial
time algorithm for computing translocation distance for the case
when the orientations of the genes are known. This leads to an algorithm
generating a most parsimonious (shortest) scenario, transforming
one genome into another by translocations.},
pdf = {D:\Publicaciones\Inproceedings\Hannenhalli S. polynomial-time algorithm for computing translocation distance between genomes.pdf},
url = {http://citeseer.ist.psu.edu/hannenhalli96polynomialtime.html}
}
@ARTICLE{Huang1988,
author = {Xiaoqiu Huang},
title = {A lower bound for the edit-distance problem under an arbitrary cost
function},
journal = {Inf. Process. Lett.},
year = {1988},
volume = {27},
pages = {319--321},
number = {6},
abstract = {NA},
address = {Amsterdam, The Netherlands, The Netherlands},
doi = {10.1016/0020-0190(88)90220-7},
issn = {0020-0190},
publisher = {Elsevier North-Holland, Inc.}
}
@ARTICLE{Hyyro2003,
author = {Heikki Hyyr\"o},
title = {A bit-vector algorithm for computing Levenshtein and Damerau edit
distances},
journal = {Nordic Journal of Computing},
year = {2003},
volume = {10},
pages = {29--39},
number = {1},
abstract = {The edit distance between strings A and B is defined as the minimum
number of edit operations needed in converting A into B or vice versa.
The Levenshtein edit distance allows three types of operations: an
insertion, a deletion or a substitution of a character. The Damerau
edit distance allows the previous three plus in addition a transposition
between two adjacent characters. To our best knowledge the best current
practical algorithms for computing these edit distances run in time
O(dm) and O(m/w(n + σ)), where d is the edit distance between the
two strings, m and n are their lengths (m ≤ n), w is the computer
word size and σ is the size of the alphabet. In this paper we present
an algorithm that runs in time O(d/wm + n/wσ) or O(d/wn + m/wσ).
The structure of the algorithm is such, that in practice it is mostly
suitable for testing whether the edit distance between two strings
is within some pre-determined error threshold. We also present some
initial test results with thresholded edit distance computation.
In them our algorithm works faster than the original algorithm of
Myers.},
address = {Finland},
citeseerurl = {http://citeseer.ist.psu.edu/537930.html},
issn = {1236-6064},
pdf = {D:\Publicaciones\Inproceedings\Hyrro H. A bit-vector algorithms for computing Levenshtein and Damerau edit distances.pdf},
publisher = {Publishing Association Nordic Journal of Computing},
url = {http://citeseer.ist.psu.edu/537930.html}
}
@ARTICLE{Hyyro2006,
author = {H. Hyyr{\"o} and G. Navarro},
title = {Bit-Parallel Computation of Local Similarity Score Matrices with
Unitary Weights},
journal = {International Journal of Foundations of Computer Science (IJFCS)},
year = {2006},
volume = {17},
pages = {1325--1344},
number = {6},
abstract = {Local similarity computation between two sequences permits detecting
all the relevant alignments present between subsequences thereof.
A well-known dynamic programming algorithm works in time O(mn), m
and n being the lengths of the subsequences. The algorithm is rather
slow when applied over many sequence pairs. In this paper we present
the first bit-parallel computation of the score matrix, for a simplified
choice of scores. If the computer word has w bits, then the resulting
algorithm works in O(mn*log min(m,n,w) / w) time, achieving up to
8-fold speedups in practice. Some DNA comparison applications use
precisely the simplified scores we handle, and thus our algorithm
is directly applicable. In others, our method could be used as a
raw filter to discard most of the strings, so the classical algorithm
can be focused only on the substring pairs that can yield relevant
results.},
url = {http://www.dcc.uchile.cl/~gnavarro/publ.html}
}
@ARTICLE{Hyyro2005a,
author = {H. Hyyr{\"o} and G. Navarro},
title = {Bit-parallel Witnesses and their Applications to Approximate String
Matching},
journal = {Algorithmica},
year = {2005},
volume = {41},
pages = {203--231},
number = {3},
abstract = {We present a new bit-parallel technique for approximate string matching.
We build on two previous techniques. The first one, BPM [Myers, J.
of the ACM, 1999], searches for a pattern of length m in a text of
length n permitting k differences in O(ceil(m/w) n) time, where w
is the width of the computer word. The second one, ABNDM [Navarro
and Raffinot, ACM JEA, 2000], extends a sublinear-time exact algorithm
to approximate searching. ABNDM relies on another algorithm, BPA
[Wu and Manber, Comm. ACM, 1992], which makes use of an O(k ceil(m/w)
n) time algorithm for its internal workings. BPA is slow but flexible
enough to support all operations required by ABNDM. We improve previous
ABNDM analyses, showing that it is average-optimal in number of inspected
characters, although the overall complexity is higher because of
the O(k ceil(m/w)) work done per inspected character. We then show
that the faster BPM can be adapted to support all the operations
required by ABNDM. This involves extending it to compute edit distance,
to search for any pattern suffix, and to detect in advance the impossibility
of a later match. The solution to those challenges is based on the
concept of a witness, which permits sampling some dynamic programming
matrix values so as to bound, deduce, or compute others fast. The
resulting algorithm is average-optimal for m <= w, assuming the alphabet
size is constant. In practice, it performs better than the original
ABNDM and is the fastest algorithm for several combinations of m,
k and alphabet sizes that are useful, for example, in natural language
searching and computational biology. To show that the concept of
witnesses can be used in further scenarios, we also improve a recent
bit-parallel algorithm based on Myers [Fredriksson, SPIRE 2003].
The use of witnesses greatly improves the running time of this algorithm
too.},
publisher = {Springer},
url = {http://www.dcc.uchile.cl/~gnavarro/publ.html}
}
@INPROCEEDINGS{Hyyroe2005a,
author = {Heikki Hyyr{\"o} and Yoan Pinzon and Ayumi Shinohara},
title = {Fast Bit-Vector Algorithms for Approximate String Matching under
Indel-Distance},
booktitle = {Proceedings of the 31st Annual Conference on Current Trends in Theory
and Practice of Informatics (SOFSEM'05)},
year = {2005},
editor = {M. Bielikov{\'a} and B. Charron-Bost and O. S{\'y}kora and P. Vojt{\'a}{\v{s}}},
volume = {3381},
series = {Lecture Notes in Computer Science},
pages = {380--384},
publisher = {Springer-Verlag},
abstract = {The approximate string matching problem is to find all location at
which a query p of length m matches a substring of a text t of length
n with at most k differences (insertion, deletion, and substitution).
The fastest solutions in practice for this problem are the bit-parallel
NFA simulation algorithms of Wu & Manber [4] and Baeza-Yates & Navarro
[1], and the bit-parallel dynamic programming algorithm of Myers
[3]. In this paper we present modified versions of these algorithms
to deal with the restricted case where only insertions and deletions
(called indel for short) are permitted. We also show test results
with the algorithms.},
isbn = {3-540-24302-X},
issn = {0302-9743},
pdf = {D:\Publicaciones\Inproceedings\Hyyro H. Pinzón Y. Shinohara A. Fast Bit-Vector Algorithms for Approximate String Matching under Indel-Distance.pdf},
url = {http://dis.unal.edu.co/profesores/ypinzon/ps/sofsem05.pdf}
}
@INPROCEEDINGS{Hyyroe2005,
author = {Heikki Hyyr{\"o} and Yoan Pinzon and Ayumi Shinohara},
title = {New Bit-Parallel Indel-Distance Algorithm},
booktitle = {Proceedings of the 4th International Workshop on Efficient and Experimental
Algorithms (WEA'05)},
year = {2005},
volume = {3503},
series = {Lecture Notes in Computer Science},
pages = {380--390},
publisher = {Springer-Verlag, Berlin},
abstract = {The task of approximate string matching is to find all locations at
which a pattern string p of length m matches a substring of a text
string t of length n with at most k differences. It is common to
use Levenshtein distance [5], which allows the differences to be
single-character insertions, deletions, or substitutions. Recently,
in [3], the IndelMYE, IndelWM and IndelBYN algorithms where introduced
as modified version of the bit-parallel algorithms of Myers [6],
Wu&Manber [10] and Baeza-Yates&Navarro [1], respectively. These modified
versions were made to support the indel distance (only single-character
insertions and/or deletions are allowed). In this paper we present
an improved version of IndelMYE that makes a better use of the bit-operations
and runs 24.5 percent faster in practice. In the end we present a
complete set of experimental results to support our findings.},
isbn = {3-540-25920-1},
issn = {0302-9743},
pdf = {D:\Publicaciones\Inproceedings\Heikki Hyyro. Yoan Pinzon. Ayumi Shinohara. New Bit-Parallel Indel-Distance Algorithm.pdf},
url = {http://dis.unal.edu.co/profesores/ypinzon/ps/wea05.pdf}
}
@PHDTHESIS{Hebrard1984,
author = {J.-J. H{\'e}brard},
title = {Distances sur les mots. Application {\`a} la recherche de motifs},
school = {Universit{\'e} de Rouen},
year = {1984},
type = {Th{\`e}se de doctorat},
abstract = {Dans divers domaines, on a à définir des critères de ressemblance
entre chaînes de caractères (mots). Etude de 3 distances définies
sur les mots: une généralisation de la distance de Hamming et deux
distances fondées sur la comparaison de sous-mots. Définition de
l'automate des sous-mots d'un mot et de la factorisation en couches},
url = {http://cat.inist.fr/?aModele=afficheN&cpsidt=9693508}
}
@ARTICLE{Hebrard1986,
author = {J.-J. H{\'e}brard and M. Crochemore},
title = {Calcul de la distance par les sous-mots},
journal = {Informatique théorique et applications},
year = {1986},
volume = {20},
pages = {441--456},
number = {4},
abstract = {Reconnaissance caractère ; Chaîne caractère ; Automate état fini ;
Structure donnée ; Automate déterministe ; Distance ; Partition ;
Algorithme ; Programmation linéaire ; Sous-mot ; Plus courte sous-suite
;},
url = {http://cat.inist.fr/?aModele=afficheN&cpsidt=8246207}
}
@INPROCEEDINGS{Irving1992,
author = {Robert W. Irving and Campbell B. Fraser},
title = {Two algorithms for the longest common subsequence of three (or more)
strings},
booktitle = {Proceedings of the 3rd Annual Symposium on Combinatorial Pattern
Matching (CPM'92)},
year = {1992},
pages = {214--229},
abstract = {Various algorithms have been proposed, over the years, for the longest
common subsequence problem on 2 strings (2-LCS), many of these improving,
at least for some cases, on the classical dynamic programming approach.
However, relatively little attention has been paid in the literature
to the k-LCS problem for k > 2, a problem that has interesting applications
in areas such as the multiple alignment of sequences in molecular
biology.
In this paper, we describe and analyse two algorithms with particular
reference to the 3-LCS problem, though each algorithm can be extended
to solve the k-LCS problem for general k. The first algorithm, which
can be viewed as a ``lazy'' version of dynamic programming,
has time and space complexity that is O(n(n-l)^2) for 3 strings,
and O(kn(n-l)^(k-1)) for k strings, where n is the common length of
the strings and l is the length of an LCS. The second algorithm,
which involves evaluating entries in a ``threshold'' table
in diagonal order, has time and space complexity that is O(l(n-l)^2+sn)
for 3 strings, and O(kl(n-l)^(k-1)+ksn) for k strings, where s is
the alphabet size. For simplicity, the algorithms are presented for
equal-length strings, though extension to unequal-length strings
is straightforward.
Empirical evidence is presented to show that both algorithms show
significant improvement on the basic dynamic programming approach,
and on an earlier algorithm proposed by Hsu and Du, particularly,
as would be expected, in the case where l is relatively large, with
the balance of evidence being heavily in favour of the threshold
approach.
Key words string algorithms - longest common subsequence
Supported by a postgraduate research studentship from the Science
and Engineering Research Council},
doi = {10.1007/3-540-56024-6_18}
}
@ARTICLE{Kashyap1983,
author = {Kashyap, R. L. and Oommen, B. J.},
title = {The Noisy Substring Matching Problem},
journal = {Software Engineering, IEEE Transactions on},
year = {1983},
volume = {SE-9},
pages = {365--370},
abstract = {Let T(U) be the set of words in the dictionary H which contains U
as a substring. The problem considered here is the estimation of
the set T(U) when U is not known, but Y, a noisy version of U is
available. The suggested set estimate S*(Y) of T(U) is a proper subset
of H such that its every element contains at least one substring
which resembles Y most according to the Levenshtein metric. The proposed
algorithm for-the computation of S*(Y) requires cubic time. The algorithm
uses the recursively computable dissimilarity measure Dk(X, Y), termed
as the kth distance between two strings X and Y which is a dissimilarity
measure between Y and a certain subset of the set of contiguous substrings
of X. Another estimate of T(U), namely SM(Y) is also suggested. The
accuracy of SM(Y) is only slightly less than that of S*(Y), but the
computation time of SM(Y) is substantially less than that of S*(Y).
Experimental results involving 1900 noisy substrings and dictionaries
which are subsets of 1023 most common English words [11] indicate
that the accuracy of the estimate S*(Y) is around 99 percent and
that of SM(Y) is about 98 percent.},
url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=1703065&isnumber=35934&punumber=32&k2dockey=1703065@ieeejrns&query=%28oommen%3Cin%3Emetadata%29&pos=14}
}
@ARTICLE{Kashyap1981,
author = {R. L. Kashyap and B. J. Oommen},
title = {An effective algorithm for string correction using generalized edit
distances {I}: description of the algorithm and its optimality},
journal = {Information Sciences},
year = {1981},
volume = {23},
pages = {123--142},
number = {2},
abstract = {This paper deals with the problem of estimating a transmitted string
X, from the
corresponding received string Y, which is a noisy version of X,. We
assume that Y contains any
number of substitution, insertion, and deletion errors, and that no
two consecutive symbols of
X, were deleted in transmission. We have shown that for channels which
cause independent
errors, and whose error probabilities exceed those of noisy strings
studied in the literature [ 121,
at least 99.5% of the erroneous strings will not contain two consecutive
deletion errors. The best
estimate X * of X, is defined as that element of H which minimizes
the generalized Levenshtein
distance D( X/Y) between X and Y. Using dynamic programming principles,
an algorithm is
presented which yields X+ without computing individually the distances
between every word
of H and Y. Though this algorithm requires more memory, it can be
shown that it is, in general,
computationally less complex than all other existing algorithms which
perform the same task},
pdf = {D:\Publicaciones\Artículos\kashyap.pdf},
url = {http://www.scs.carleton.ca/~oommen/papers/TrieBFS01Published.pdf}
}
@INPROCEEDINGS{Kececioglu1995,
author = {John D. Kececioglu and R. Ravi},
title = {Of mice and men: algorithms for evolutionary distances between genomes
with translocation},
booktitle = {SODA '95: Proceedings of the sixth annual ACM-SIAM symposium on Discrete
algorithms},
year = {1995},
pages = {604--613},
address = {Philadelphia, PA, USA},
publisher = {Society for Industrial and Applied Mathematics},
abstract = {NA},
isbn = {0-89871-349-8},
location = {San Francisco, California, United States},
url = {http://portal.acm.org/citation.cfm?id=313825&dl=ACM&coll=GUIDE#}
}
@INPROCEEDINGS{Kececioglu1994,
author = {John D. Kececioglu and David Sankoff},
title = {Efficient Bounds for Oriented Chromosome Inversion Distance},
booktitle = {CPM '94: Proceedings of the 5th Annual Symposium on Combinatorial
Pattern Matching},
year = {1994},
pages = {307--325},
address = {London, UK},
publisher = {Springer-Verlag},
abstract = {...two circular chromosomes that have...by chromosome inversion, assuming
that...that tight bounds on the...simple and efficient algorithms},
doi = {10.1007/3-540-58094-8_26},
isbn = {3-540-58094-8}
}
@INPROCEEDINGS{Kececioglu1993,
author = {John D. Kececioglu and David Sankoff},
title = {Exact and Approximation Algorithms for the Inversion Distance Between
Two Chromosomes},
booktitle = {CPM '93: Proceedings of the 4th Annual Symposium on Combinatorial
Pattern Matching},
year = {1993},
pages = {87--105},
address = {London, UK},
publisher = {Springer-Verlag},
abstract = {Motivated by the problem in computational...the series of chromosome
inversions by which one...},
doi = {10.1007/BFb0029799},
isbn = {3-540-56764-X}
}
@INPROCEEDINGS{Kim2000,
author = {Sung-Ryul Kim and Kunsoo Park},
title = {A Dynamic Edit Distance Table},
booktitle = {CPM '00: Proceedings of the 11th Annual Symposium on Combinatorial
Pattern Matching},
year = {2000},
pages = {60--68},
address = {London, UK},
publisher = {Springer-Verlag},
abstract = {...of the edit distance problem: given a solution to...the D-table,
which leads...},
isbn = {3-540-67633-3},
url = {http://www.sinab.unal.edu.co:2090/content/?k=A+Dynamic+Edit+Distance+Table}
}
@INPROCEEDINGS{740125,
author = {Philip N. Klein},
title = {Computing the Edit-Distance between Unrooted Ordered Trees},
booktitle = {ESA '98: Proceedings of the 6th Annual European Symposium on Algorithms},
year = {1998},
pages = {91--102},
address = {London, UK},
publisher = {Springer-Verlag},
abstract = {An ordered tree is a...think of the tree as...trees. The edit distance
between A and...},
isbn = {3-540-64848-8},
url = {http://www.sinab.unal.edu.co:2090/content/?k=Computing+the+Edit-Distance+between+Unrooted+Ordered+Trees}
}
@INBOOK{Knight1992,
pages = {67-78},
title = {Approximate regular expression pattern matching with concave gap
penalties},
year = {1992},
author = {James R. Knight and Eugene W. Myers},
volume = {644/1992},
abstract = {Given a sequence A of length M and a regular expression R of length
P, an approximate regular expression pattern matching algorithm computes
the score of the best alignment between A and one of the sequences
exactly matched by R. There are a variety of schemes for scoring
alignments. In a concave gap-penalty scoring scheme, a function delta(a,
b) gives the score of each aligned pair of symbols a and b, and a
concave function w(k) gives the score of a sequence of unaligned
symbols, or gap, of length k. A function w is concave if and only
if it has the property that for all k > 1, w(k+ 1)-w(k)