diff options
author | yuta.256 <yuta.256@b7c3aa3b-274f-0410-ae0b-edc9d07c929d> | 2008-06-17 20:25:18 +0000 |
---|---|---|
committer | yuta.256 <yuta.256@b7c3aa3b-274f-0410-ae0b-edc9d07c929d> | 2008-06-17 20:25:18 +0000 |
commit | 18d7a90ad01a043f61d8af8b997fdeac10772cef (patch) | |
tree | d893b595fb41b967ef3f69e07979f359bd566d8c | |
download | platform_external_libdivsufsort-18d7a90ad01a043f61d8af8b997fdeac10772cef.tar.gz platform_external_libdivsufsort-18d7a90ad01a043f61d8af8b997fdeac10772cef.tar.bz2 platform_external_libdivsufsort-18d7a90ad01a043f61d8af8b997fdeac10772cef.zip |
Initial import.
-rw-r--r-- | AUTHORS | 3 | ||||
-rw-r--r-- | COPYING | 22 | ||||
-rw-r--r-- | ChangeLog | 175 | ||||
-rw-r--r-- | ChangeLog.old | 67 | ||||
-rw-r--r-- | INSTALL | 51 | ||||
-rw-r--r-- | Makefile.am | 5 | ||||
-rw-r--r-- | NEWS | 36 | ||||
-rw-r--r-- | README | 155 | ||||
-rw-r--r-- | configure.ac | 72 | ||||
-rw-r--r-- | examples/Makefile.am | 6 | ||||
-rw-r--r-- | examples/bwt.c | 152 | ||||
-rw-r--r-- | examples/mksary.c | 132 | ||||
-rw-r--r-- | examples/sasearch.c | 402 | ||||
-rw-r--r-- | examples/suftest.c | 133 | ||||
-rw-r--r-- | examples/unbwt.c | 121 | ||||
-rw-r--r-- | include/Makefile.am | 4 | ||||
-rw-r--r-- | include/divsufsort.h.in | 132 | ||||
-rw-r--r-- | include/divsufsort_private.h.in | 162 | ||||
-rw-r--r-- | lib/Makefile.am | 9 | ||||
-rw-r--r-- | lib/divsufsort.c | 362 | ||||
-rw-r--r-- | lib/libdivsufsort.sym | 8 | ||||
-rw-r--r-- | lib/substringsort.c | 619 | ||||
-rw-r--r-- | lib/trsort.c | 684 | ||||
-rw-r--r-- | lib/utils.c | 378 |
24 files changed, 3890 insertions, 0 deletions
@@ -0,0 +1,3 @@ +-- AUTHORS for libdivsufsort + +Yuta Mori <yiv01157@nifty.com> @@ -0,0 +1,22 @@ +Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..1b89304 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,175 @@ +2008-02-23 Yuta Mori <yiv01157@nifty.com> + + * lib/substringsort.c (_merge_backward): Bug fix. + * lib/trsort.c (_tr_introsort): Bug fix. + +2007-09-02 Yuta Mori <yiv01157@nifty.com> + + * lib/trsort.c (_ls_introsort): Important bug fix. + +2007-07-15 Yuta Mori <yiv01157@nifty.com> + + A few bug fixes. + + * lib/divsufsort.c (divbwt): Bug fix. + * lib/trsort.c (_tr_introsort): Bug fix. + * lib/utils.c (sa_search, sa_simplesearch): New functions. + * lib/libdivsufsort.sym: Update. + * include/divsufsort.h.in: Update. + * examples/sasearch.c: New file. + * examples/Makefile.am: Update. + * configure.ac: Update. + * NEWS: Update. + * README: Update. + +2007-04-14 Yuta Mori <yiv01157@nifty.com> + + Change license to the MIT/X11 license. + Update all files for 1.2.0. + + * lib/libdivsufsort.sym: New file for libtool. + +2007-04-07 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.7. + +2007-04-07 Yuta Mori <yiv01157@nifty.com> + + Replace drsort with tandem repeat sorting algorithm and Larsson-Sadakane sorting algorithm. + + * lib/trsort.c: New file. + * lib/drsort.c: Delete. + * lib/divsufsort.c: Update. + * lib/Makefile.am: Update. + * lib/divsufsort_private.h.in (LS_INSERTIONSORT_THRESHOLD, TR_INSERTIONSORT_THRESHOLD): New constants. + (DR_INSERTIONSORT_THRESHOLD): Delete. + (STACK_PUSH3, STACK_POP3): New macros. + +2007-03-31 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.6. + +2007-03-31 Yuta Mori <yiv01157@nifty.com> + + Replace _ss_merge with new merge algorithms. + + * lib/substringsort.c (_ss_merge): Delete. + * lib/substringsort.c (_block_swap, _merge_forward, _merge_backward, _merge): New functions. + (substringsort): Update. + * lib/divsufsort.c (_sort_typeBstar, divsufsort, divbwt): Update. + * include/divsufsort_private.h.in (LOCALMERGE_BUFFERSIZE): New constant. + (SS_MERGESORT_QUEUESIZE): Delete. + +2007-03-24 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.5. + +2007-03-23 Yuta Mori <yiv01157@nifty.com> + + Replace breadth-first introsort with new multikey introsort. + + * lib/substringsort.c (_compare): Update. + (_substring_partition): Update. + (_multikey_introsort): New function. + (_introsort, _bfintrosort): Delete. + (substringsort): Update. + * lib/divsufsort.c (_sort_typeBstar): Update. + +2007-03-21 Yuta Mori <yiv01157@nifty.com> + + * lib/substringsort.c (_introsort): Convert introsort to a non-recursive algorithm. + (substringsort): Update. + * lib/divsufsort.c (_sort_typeBstar): Update. + +2007-03-21 Yuta Mori <yiv01157@nifty.com> + + * include/divsufsort_private.h.in (STACK_SIZE): Rename from SS_STACK_SIZE. + (SS_BLOCKSIZE): Rename from SS_MKQSORT_THRESHOLD. + (SS_MKQSORT_DMAX, SS_DSWAP, SS_STACK_PUSH, SS_STACK_POP): Delete. + (STACK_PUSH, STACK_POP): New macros. + (substringsort): Update prototype. + +2007-03-17 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.4. + +2007-03-17 Yuta Mori <yiv01157@nifty.com> + + * substringsort.c (_fixdown, _heapsort, _lg): New function. + (_introsort): Rename from _quicksort. Change to use new partitioning algorithm. + (_bfintrosort): Rename from _bfquicksort. + +2007-03-10 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.3. + +2007-03-10 Yuta Mori <yiv01157@nifty.com> + + Replace depth-first multikey quicksort with new breadth-first ternary quicksort. + + * substringsort.c (_ss_compare_lcp, _ss_tqsort, _ss_mkqsort): Remove. + (_median3): Rename from _ss_median and rewrite. + (_pivot): Rename from _ss_pivot and rewrite. + (_median5, _substring_partition, _quicksort, _bfquicksort): New function. + +2007-03-03 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.2. + +2007-03-03 Yuta Mori <yiv01157@nifty.com> + + * substringsort.c (_compare): Rename from _ss_compare and rewrite. + (_insertionsort): Rename from _ss_insertionsort and rewrite. + +2007-02-24 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.1. + +2007-02-24 Yuta Mori <yiv01157@nifty.com> + + * lib/substringsort.c (_ss_getc): Remove. + +2007-02-17 Yuta Mori <yiv01157@nifty.com> + + Update files for 1.1.0. + +2007-02-17 Yuta Mori <yiv01157@nifty.com> + + * utils.c (bwtcheck): Remove. + +2007-02-11 Yuta Mori <yiv01157@nifty.com> + + * lib/divsufsort.c, + include/divsufsort.h.in, + include/divsufsort_private.h.in: + Change to use a new improved two-stage sort algorithm (version 070210). + +2007-01-28 Yuta Mori <yiv01157@nifty.com> + + * lib/divsufsort.c (_sort): Fix a bug that using wrong index. + +2007-01-28 Yuta Mori <yiv01157@nifty.com> + + * examples/bwt.c: Rename from examples/bwt2.c. + * examples/unbwt.c: Rename from examples/unbwt2.c. + * examples/bwt1.c: Delete. + * examples/unbwt1.c: Delete. + +2007-01-28 Yuta Mori <yiv01157@nifty.com> + + * lib/divsufsort.c, include/divsufsort_private.h.in: + Change to use new improved two-stage sort algorithm (version 070128). + +2007-01-24 Yuta Mori <yiv01157@nifty.com> + + Remove use of libtool. + + * include/divsufsort_private.h.in: Rename from include/divsufsort_private.h. + +2007-01-24 Yuta Mori <yiv01157@nifty.com> + + Initial import. + +;; Local Variables: +;; coding: utf-8 +;; End: diff --git a/ChangeLog.old b/ChangeLog.old new file mode 100644 index 0000000..df8de8e --- /dev/null +++ b/ChangeLog.old @@ -0,0 +1,67 @@ +Version 1.0.2 (2006-01-01): + + * Release 1.0.2 + +Version 1.0.2b (2005-12-11): + + * lib/divsufsort.c (_construct_typeBstar): Completely rewrite. + +Version 1.0.2a (2005-12-04): + + * lib/substringsort.c: Completely rewrite. + * lib/drsort.c: Completely rewrite. + * lib/divsufsort.c (_sort_typeBstar): Fix some bugs. + +Version 1.0.1 (2005-11-08): + + * Release 1.0.1 + +Version 1.0.1a (2005-11-06): + + * configure.ac: Add AM_ENABLE_STATIC, AM_DISABLE_SHARED and AC_LIBTOOL_WIN32_DLL + + * Makefile.am (EXTRA_DIST): Add ChangeLog.old. + + * lib/divsufsort.c (_construct_typeBstar): Fix. + + * AUTHORS: New file. + * ChangeLog: New file. + * ChangeLog.old: New file. + * INSTALL: New file. + * NEWS: New file. + +Version 1.0.0 (2005-10-31) + * Introduced autoconf and automake. + * Added new example programs. + +Version 0.2.1 (2005-08-27) + * divsufsort.c: Kao's algorithm was replaced with Improved Two-Stage algorithm. + * divsufsort.c: Reduced memory usage. + * substringsort.c: Added mergesort for sorting large groups of suffixes. + +Version 0.1.6 (2005-06-10) + * divsufsort.h: Renamed from libdivsufsort.h. (again...) + * divsufsort.c: Renamed from libdivsufsort.c. (again...) + * divsufsort.c: Reduced memory usage. + * substringsort.c, substringsort.h, drsort.c, drsort.h: Modify. + * mksary_mmap/makefile, mksary_mmap/mksary.c, + mksary_mmap/mmap.c, mksary_mmap/mmap.h: Removed. + +Version 0.1.5 (2005-04-07) + * libdivsufsort.c: ranksort and doublingsort were replaced with drsort. + * def.h, drsort.c, drsort.h: New file. + * doublingsort.c, doublingsort.h, ranksort.c, ranksort.h: Removed. + +Version 0.1.4 (2005-03-27) + * mksary/mksary.c, mksary_mmap/mksary.c, suftest.c: Added error handling. + +Version 0.1.3 (2005-01-28) + * mksary/makefile, mksary/mksary.c, mksary_mmap/makefile, + mksary_mmap/mksary.c, mksary_mmap/mmap.c, mksary_mmap/mmap.h: New file. + * libdivsufsort.c: Modify. + +Version 0.1.2 (2005-01-01) + * suftest.c: New file. + * libdivsufsort.c: Renamed from divsufsort.c. + * libdivsufsort.h: Renamed from divsufsort.h. + @@ -0,0 +1,51 @@ +-- INSTALL for libdivsufsort + +Compilation and Installation: +----------------------------- + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a `Makefile' in each directory in the +libdivsufsort source tree. Finally, it creates a shell script +`config.status' that you can run in the future to recreate the +current configuration, and a file `config.log' containing compiler +output (useful mainly for debugging `configure'). + + The file `configure.ac' is used to create `configure' by a program +called `autoconf'. You only need `configure.ac' if you want to change +it or regenerate `configure' using a newer version of `autoconf'. + +The simplest way to compile this package is: + + 1. `cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. If you're + using `csh' on an old version of System V, you might need to type + `sh ./configure' instead to prevent `csh' from trying to execute + `configure' itself. + + Running `configure' takes awhile. While running, it prints some + messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Type `make install' to install the library and header files. + + 4. You can remove the program binaries and object files from the + source code directory by typing `make clean'. To also remove the + files that `configure' created (so you can compile the package for + a different kind of computer), type `make distclean'. + + +Compilers and Options: +---------------------- + + Some systems require unusual options for compilation or linking +that the `configure' script does not know about. You can give +`configure' initial values for configuration parameters by setting +variables in the command line or in the environment. Here +is an example: + + ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix + +Run `./configure --help' for details on some of the pertinent +environment variables. diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..99af5e2 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,5 @@ +# Makefile.am for libdivsufsort + +SUBDIRS = include lib examples + +EXTRA_DIST = ChangeLog.old @@ -0,0 +1,36 @@ +-- NEWS for libdivsufsort + +Changes in release 1.2.3 + + - Bug fixes. + +Changes in release 1.2.2 + + - An important bug fix. + +Changes in release 1.2.1 + + - A few bug fixes. + - New APIs: sa_search, sa_simplesearch. + +Changes in release 1.2.0 + + - Changed license to the MIT/X11 license, see COPYING. + - Improved the performance of the suffix array construction. + - Change to use a new improved two-stage sorting algorithm. + - Replace substringsort with a new sorting algorithm that uses + multikey introspective sorting algorithm and in-place/recursive + merging algorithm. + - Replace drsort with a new sorting algorithm that uses + multikey introspective sorting algorithm, Maniscalco's tandem + repeat sorting algorithm and Larsson-Sadakane sorting algorithm. + - New API: divbwt. + +Changes in release 1.0.2 + + - The performance of sorting has been improved. + - Fix some bugs. + +Changes in release 1.0.1 + + - The performance of sorting has been improved a little bit. @@ -0,0 +1,155 @@ +libdivsufsort - A lightweight suffix-sorting library. +----------------------------------------------------- + +Introduction: +------------- + +libdivsufsort is an open-source library that provides a high +performance and lightweight suffix-sorting and BWT-construction +algorithm. For input size n, this algorithm runs in O(n log n) +worst-case (and average) time using only 5n bytes of memory. + +The latest version of libdivsufsort is available at: + http://homepage3.nifty.com/wpage/software/libdivsufsort.html + + +License: +-------- + +libdivsufsort is released under the MIT/X11 license. See the file +COPYING for more details. + + +APIs: +----- + + * Data types + typedef int32_t saint_t; + typedef int32_t saidx_t; + typedef uint8_t sauchar_t; + + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n] The output array or suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred, -1 or -2 otherwise. + saint_t + divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n); + + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n] The temporary array. (can be NULL) + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. + saidx_t + divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n); + + * Constructs the burrows-wheeler transformed string of a given string and suffix array. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param SA[0..n] The suffix array. (can be NULL) + * @param n The length of the given string. + * @param idx The output primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + saint_t + bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA, + saidx_t n, saidx_t *idx); + + * Inverse BW-transforms a given BWTed string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n] The temporary array. (can be NULL) + * @param n The length of the given string. + * @param idx The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + saint_t + inverse_bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *A, saidx_t n, saidx_t idx); + + * Checks the correctness of a given suffix array. + * @param T[0..n-1] The input string. + * @param SA[0..n] The input suffix array. + * @param n The length of the given string. + * @param verbose The verbose mode. + * @return 0 if no error occurred. + saint_t + sufcheck(const sauchar_t *T, const saidx_t *SA, + saidx_t n, saint_t verbose); + + * Search for the pattern P in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param P[0..Psize-1] The input pattern string. + * @param Psize The length of the given pattern string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. + * @param idx The output index. + * @return The count of matches if no error occurred, -1 otherwise. + saidx_t + sa_search(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t *idx); + + * Search for the character c in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. + * @param c The input character. + * @param idx The output index. + * @return The count of matches if no error occurred, -1 otherwise. + saidx_t + sa_simplesearch(const sauchar_t *T, saidx_t Tsize, + const saidx_t *SA, saidx_t SAsize, + saint_t c, saidx_t *idx); + + * Returns the version string of libdivsufsort. + * @return the version string. + const char * + divsufsort_version(void); + + +Benchmark results: +------------------ + +http://homepage3.nifty.com/wpage/benchmark/index.html + + +Algorithm: +---------- + +libdivsufsort uses the following algorithms for suffix sorting. + - The improved version of Itho-Tanaka two-stage sorting algorithm. [2][6] + - A substring sorting/encoding technique. [1][3] + - Maniscalco's tandem repeat sorting algorithm. [5] + - Larsson-Sadakane sorting algorithm. [4] + + +References: +----------- + + 1. Stefan Burkhardt and Juha K"arkk"ainen. Fast lightweight suffix + array construction and checking. Proceedings of the 14th Annual + Symposium on Combinatorial Pattern Matching, LNCS 2676, + Springer, pp. 55-69, 2003. + + 2. Hideo Itoh and Hozumi Tanaka, An Efficient Method for in Memory + Construction of Suffix Arrays, Proceedings of the IEEE String + Processing and Information Retrieval Symposium, pp. 81-88, 1999. + + 3. Pang Ko and Srinivas Aluru, Space-efficient linear time + construction of suffix arrays, Proceedings of the 14th Annual + Symposium on Combinatorial Pattern Matching, pp. 200-210, 2003. + + 4. Jesper Larsson and Kunihiko Sadakane, Faster suffix sorting. + Technical report LU-CS-TR:99-214, Department of Computer + Science, Lund University, Sweden, 1999. + + 5. Michael Maniscalco, MSufSort. + http://www.michael-maniscalco.com/msufsort.htm + + 6. Yuta Mori, Short description of improved two-stage suffix sorting + algorithm, 2005. + http://homepage3.nifty.com/wpage/software/itssort.txt diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..ed090e5 --- /dev/null +++ b/configure.ac @@ -0,0 +1,72 @@ +# configure.ac for libdivsufsort + +AC_PREREQ(2.59) + +AC_INIT([libdivsufsort], [1.2.3], [yiv01157@nifty.com]) +AC_CONFIG_SRCDIR([include/divsufsort.h.in]) +AC_CONFIG_HEADER([include/config.h]) +AC_CONFIG_AUX_DIR([config]) +AM_INIT_AUTOMAKE([-Wall -Werror foreign 1.9 no-define dist-bzip2]) + +# Library code modified: REVISION++ +# Interfaces changed/added/removed: CURRENT++ REVISION=0 +# Interfaces added: AGE++ +# Interfaces removed: AGE=0 +AC_SUBST(LT_CURRENT, 3) +AC_SUBST(LT_REVISION, 2) +AC_SUBST(LT_AGE, 1) + +# Checks for programs. +AC_PROG_CC +AC_PROG_INSTALL +AC_PROG_MAKE_SET + +# Check for host type. +AC_CANONICAL_HOST + +# Checks for compiler output filename suffixes. +AC_OBJEXT +AC_EXEEXT + +# Check for build configuration. +AM_DISABLE_STATIC +AC_LIBTOOL_WIN32_DLL +AC_PROG_LIBTOOL +case "$host_os" in + cygwin*) + AC_ARG_ENABLE(cygwin, AC_HELP_STRING([--disable-cygwin], [Don't use the CygWin libraries])) + if test "$enable_cygwin" != "no"; then + A_CFLAGS='' + else + A_CFLAGS='-mno-cygwin' + fi + LT_NOUNDEF='-no-undefined' + ;; + *) LT_NOUNDEF='' A_CFLAGS='' ;; +esac +AC_SUBST([LT_NOUNDEF]) +AC_SUBST([A_CFLAGS]) + +# Checks for libraries. + +# Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS([inttypes.h memory.h stddef.h stdint.h stdlib.h string.h strings.h]) +AC_CHECK_HEADER([inttypes.h], [AC_SUBST([_HAVE_INTTYPES_H_], [1])], [AC_SUBST([_HAVE_INTTYPES_H_], [0])]) +AC_CHECK_HEADER([stdint.h], [AC_SUBST([_HAVE_STDINT_H_], [1])], [AC_SUBST([_HAVE_STDINT_H_], [0])]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_C_INLINE + +# Checks for library functions. +AC_FUNC_MALLOC +AC_FUNC_STAT + +AC_CONFIG_FILES([Makefile + include/Makefile + include/divsufsort.h + include/divsufsort_private.h + lib/Makefile + examples/Makefile]) +AC_OUTPUT diff --git a/examples/Makefile.am b/examples/Makefile.am new file mode 100644 index 0000000..f098500 --- /dev/null +++ b/examples/Makefile.am @@ -0,0 +1,6 @@ +# Makefile.am for libdivsufsort + +noinst_PROGRAMS = suftest mksary sasearch bwt unbwt +LDADD = $(top_builddir)/lib/libdivsufsort.la + +AM_CFLAGS = @A_CFLAGS@ diff --git a/examples/bwt.c b/examples/bwt.c new file mode 100644 index 0000000..41591f8 --- /dev/null +++ b/examples/bwt.c @@ -0,0 +1,152 @@ +/* + * bwt.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <divsufsort.h> +#include <stdio.h> +#if HAVE_STDLIB_H +# include <stdlib.h> +#endif +#if HAVE_STRING_H +# if !STDC_HEADERS && HAVE_MEMORY_H +# include <memory.h> +# endif +# include <string.h> +#endif +#if HAVE_STRINGS_H +# include <strings.h> +#endif +#include <time.h> + + +static +saidx_t +_str2size(const char *str) { + saidx_t s[3]; + saidx_t t; + int i, c; + for(i = 0, t = s[0] = s[1] = s[2] = 0; (c = str[i]) != '\0'; ++i) { + if(('0' <= c) && (c <= '9')) { + t = (t * 10) + (c - '0'); + } else { + switch(c) { + case 'm': + case 'M': + s[0] += t << 20; + break; + case 'k': + case 'K': + s[1] += t << 10; + break; + case 'b': + case 'B': + s[2] += t; + break; + } + t = 0; + } + } + return s[0] + s[1] + s[2]; +} + +int +main(int argc, const char *argv[]) { + sauchar_t *T; + saidx_t *SA; + saidx_t m, n, blocksize, idx; + clock_t start,finish; + + /* Check argument. */ + if(((argc != 1) && (argc != 2)) || + ((argc == 2) && (strcmp(argv[1], "-h") == 0)) || + ((argc == 2) && (strcmp(argv[1], "--help") == 0))) { + fprintf(stderr, + "bwt, a burrows-wheeler transform program, version %s.\n" + , divsufsort_version()); + fprintf(stderr, + "usage: %s [BLOCKSIZE] < STDIN > STDOUT\n\n" + , argv[0]); + exit(EXIT_FAILURE); + } + blocksize = (argc == 2) ? _str2size(argv[1]) : 0; + if(blocksize <= 0) { + fseek(stdin, 0, SEEK_END); + blocksize = ftell(stdin); + if(blocksize < 0) { blocksize = BUFSIZ; } + rewind(stdin); + } + + /* Allocate blocksize+4*(blocksize+1) bytes of memory. */ + if(((T = malloc(blocksize * sizeof(sauchar_t))) == NULL) || + ((SA = malloc((blocksize + 1) * sizeof(saidx_t))) == NULL)) { + fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); + exit(EXIT_FAILURE); + } + + /* Write the blocksize. */ + if(fwrite(&blocksize, sizeof(saidx_t), 1, stdout) != 1) { + fprintf(stderr, "%s: Cannot write to `stdout': ", argv[0]); + perror(NULL); + exit(EXIT_FAILURE); + } + + fprintf(stderr, " BWT (blocksize %d) ... ", (int)blocksize); + start=clock(); + for(n = 0; 0 < (m = fread(T, sizeof(sauchar_t), blocksize, stdin)); n += m) { + /* Burrows-Wheeler Transform. */ + idx = divbwt(T, T, SA, m); + if(idx < 0) { + fprintf(stderr, "%s (bw_transform): %s.\n", + argv[0], + (idx == -1) ? "Invalid arguments" : "Cannot allocate memory"); + exit(EXIT_FAILURE); + } + + /* Write the bwted data. */ + if((fwrite(&idx, sizeof(saidx_t), 1, stdout) != 1) || + (fwrite(T, sizeof(sauchar_t), m, stdout) != m)) { + fprintf(stderr, "%s: Cannot write to `stdout': ", argv[0]); + perror(NULL); + exit(EXIT_FAILURE); + } + } + if(ferror(stdin)) { + fprintf(stderr, "%s: Cannot read from `stdin': ", argv[0]); + perror(NULL); + exit(EXIT_FAILURE); + } + finish = clock(); + fprintf(stderr, "%d bytes: %.4f sec\n", + (int)n, (double)(finish - start) / (double)CLOCKS_PER_SEC); + + /* Deallocate memory. */ + free(T); + free(SA); + + return 0; +} diff --git a/examples/mksary.c b/examples/mksary.c new file mode 100644 index 0000000..a548582 --- /dev/null +++ b/examples/mksary.c @@ -0,0 +1,132 @@ +/* + * mksary.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <divsufsort.h> +#include <stdio.h> +#if HAVE_STDLIB_H +# include <stdlib.h> +#endif +#include <time.h> +#if HAVE_SYS_STAT_H +# include <sys/stat.h> +#endif + + +int +main(int argc, const char *argv[]) { + FILE *ifp, *ofp; + sauchar_t *T; + saidx_t *SA; + saidx_t n; + clock_t start, finish; +#if HAVE_SYS_STAT_H + struct stat s; +#endif + + /* Check argument. */ + if(argc != 3) { + fprintf(stderr, + "mksary, a simple suffix array builder, version %s.\n" + , divsufsort_version()); + fprintf(stderr, + "usage: %s srcFILE dstSA\n\n" + , argv[0]); + exit(EXIT_FAILURE); + } + + /* Get a file's status information. */ +#if HAVE_SYS_STAT_H + if(stat(argv[1], &s) != 0) { + fprintf(stderr, "%s: Cannot stat file `%s': ", argv[0], argv[1]); + perror(NULL); + exit(EXIT_FAILURE); + } + n = s.st_size; +#endif + + /* Open a file for reading. */ + if((ifp = fopen(argv[1], "rb")) == NULL) { + fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[1]); + perror(NULL); + exit(EXIT_FAILURE); + } + + /* Open a file for writing. */ + if((ofp = fopen(argv[2], "wb")) == NULL) { + fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[2]); + perror(NULL); + exit(EXIT_FAILURE); + } + +#if !HAVE_SYS_STAT_H + fseek(ifp, 0, SEEK_END); + n = ftell(ifp); + rewind(ifp); +#endif + + /* Allocate n+4(n+1) bytes of memory. */ + if(((T = malloc(n * sizeof(sauchar_t))) == NULL) || + ((SA = malloc((n + 1) * sizeof(saidx_t))) == NULL)) { + fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); + exit(EXIT_FAILURE); + } + + /* Read n bytes of data. */ + if(fread(T, sizeof(sauchar_t), n, ifp) != n) { + fprintf(stderr, "%s: %s `%s': ", + argv[0], + (ferror(ifp) || !feof(ifp)) ? "Cannot read from" : "Unexpected EOF in", + argv[1]); + perror(NULL); + exit(EXIT_FAILURE); + } + fclose(ifp); + + /* Construct the suffix array. */ + fprintf(stderr, "%s: %d bytes ... ", argv[1], (int)n); + start = clock(); + divsufsort(T, SA, n); + finish = clock(); + fprintf(stderr, "%.4f sec\n", + (double)(finish - start) / (double)CLOCKS_PER_SEC); + + /* Write the suffix array. */ + if(fwrite(SA, sizeof(saidx_t), n + 1, ofp) != (n + 1)) { + fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], argv[2]); + perror(NULL); + exit(EXIT_FAILURE); + } + fclose(ofp); + + /* Deallocate memory. */ + free(SA); + free(T); + + return 0; +} diff --git a/examples/sasearch.c b/examples/sasearch.c new file mode 100644 index 0000000..63a3932 --- /dev/null +++ b/examples/sasearch.c @@ -0,0 +1,402 @@ +/* + * sasearch.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <divsufsort.h> +#include <stdio.h> +#if HAVE_STDLIB_H +# include <stdlib.h> +#endif +#include <string.h> +#if HAVE_STRING_H +# if !STDC_HEADERS && HAVE_MEMORY_H +# include <memory.h> +# endif +# include <string.h> +#endif +#if HAVE_STRINGS_H +# include <strings.h> +#endif +#if HAVE_SYS_STAT_H +# include <sys/stat.h> +#endif + + +#define SA_SORT_LEXICOGRAPHICALORDER (1 << 0) +#define SA_PRINT_OFFSET (1 << 1) +#define SA_PRINT_FILENAME (1 << 2) +#define SA_HEX_MODE (1 << 3) + +typedef struct _searchoption_t searchoption_t; +struct _searchoption_t { + const char *fname; + saidx_t maxcount; + saidx_t blen, alen; + unsigned int flags; + void(*func)(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t left, saidx_t size, searchoption_t *opt); +}; + +static +void +_print_suffix(const sauchar_t *T, saidx_t Tsize, saidx_t Psize, saidx_t pos, + const searchoption_t *option) { + saidx_t i; + saidx_t a, b, c, d; + + a = (option->blen < pos) ? pos - option->blen : 0; + b = pos; + c = pos + Psize; + d = ((c + option->alen) < Tsize) ? c + option->alen : Tsize; + + if(option->flags & SA_PRINT_FILENAME) { printf("%s:", option->fname); } + if(option->flags & SA_PRINT_OFFSET) { printf("%d:", a); } + + if(option->flags & SA_HEX_MODE) { + for(i = a; i < (d - 1); ++i) { + printf("%02x ", T[i]); + } + printf("%02x\n", T[d - 1]); + } else { + for(i = a; i < d; ++i) { + switch(T[i]) { + case '\n': printf("[\\n]"); break; + case '\r': printf("[\\r]"); break; + default: printf("%c", T[i]); + } + } + printf("\n"); + } +} + +static +void +_onlyprint_count(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t left, saidx_t size, searchoption_t *option) { + if(0 < size) { + if(option->flags & SA_PRINT_FILENAME) { printf("%s:", option->fname); } + printf("%d\n", size); + } +} + +static +int +_idx_cmp(const void *p1, const void *p2) { + saidx_t i1 = *((saidx_t *)p1), i2 = *((saidx_t *)p2); + if(i1 < i2) { return -1; } + if(i1 > i2) { return 1; } + return 0; +/* return i1 - i2; */ +} + +static +void +_print_suffixes(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t left, saidx_t size, searchoption_t *option) { + saidx_t *ary; + saidx_t i; + + if(option->flags & SA_SORT_LEXICOGRAPHICALORDER) { + for(i = 0; i < size; ++i) { + _print_suffix(T, Tsize, Psize, SA[left + i], option); + } + } else { + ary = malloc(size * sizeof(saidx_t)); + memcpy(ary, SA + left, size * sizeof(saidx_t)); + qsort(ary, size, sizeof(saidx_t), _idx_cmp); + for(i = 0; i < size; ++i) { + _print_suffix(T, Tsize, Psize, ary[i], option); + } + free(ary); + } +} + +static +void +_search_file(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + searchoption_t *option) { + saidx_t size, left; + size = sa_search(T, Tsize, P, Psize, SA, SAsize, &left); + if(0 <= option->maxcount) { + if(option->maxcount == 0) { return; } + if(option->maxcount < size) { size = option->maxcount; } + } + option->func(T, Tsize, P, Psize, SA, SAsize, left, size, option); +} + +static +void +_print_version(const char *pname) { + fprintf(stderr, +"%s, a SA-based full-text search tool, version %s\n\n", + pname, divsufsort_version()); +} + +static +void +_print_usage(const char *pname, int status) { + _print_version(pname); + fprintf(stderr, +"usage: %s [OPTION]... PATTERN FILE SAFILE\n" +"\nOutput control:\n" +" -m NUM stop after NUM matches\n" +" -b print the byte offset\n" +" -H print the filename\n" +" -c only print a count of matches\n" +" -S sort in lexicographical order\n" +"\nContext control:\n" +" -B NUM print NUM characters of leading context\n" +" -A NUM print NUM characters of trailing context\n" +" -C NUM print NUM characters of output context\n" +"\nMiscellaneous:\n" +" -h print this message\n" +" -v display version number\n" +"\n", pname); + exit(status); +} + +static +void +_print_tryhelp(const char *pname, int status) { + fprintf(stderr, "Try `%s --help' for more information.\n", pname); + exit(status); +} + +static +void +_print_version_and_license(const char *pname, int status) { + _print_version(pname); + fprintf(stderr, + " Copyright (c) 2003-2007 Yuta Mori All Rights Reserved.\n" + "\n" + " Permission is hereby granted, free of charge, to any person\n" + " obtaining a copy of this software and associated documentation\n" + " files (the \"Software\"), to deal in the Software without\n" + " restriction, including without limitation the rights to use,\n" + " copy, modify, merge, publish, distribute, sublicense, and/or sell\n" + " copies of the Software, and to permit persons to whom the\n" + " Software is furnished to do so, subject to the following\n" + " conditions:\n" + "\n" + " The above copyright notice and this permission notice shall be\n" + " included in all copies or substantial portions of the Software.\n" + "\n" + " THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n" + " EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES\n" + " OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n" + " NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT\n" + " HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,\n" + " WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n" + " FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" + " OTHER DEALINGS IN THE SOFTWARE.\n"); + exit(status); +} + + +int +main(int argc, const char *argv[]) { + int i; + searchoption_t option; + + if(argc <= 1) { _print_usage(argv[0], EXIT_SUCCESS); } + + option.maxcount = -1; + option.fname = NULL; + option.flags = 0; + option.func = _print_suffixes; + option.alen = option.blen = 10; + + for(i = 1; i < argc; ++i) { + if(argv[i][0] != '-') { break; } + + if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { + _print_usage(argv[0], EXIT_SUCCESS); + } + if((strcmp(argv[i], "-v") == 0) || (strcmp(argv[i], "--version") == 0)) { + _print_version_and_license(argv[0], EXIT_SUCCESS); + } + + if(strcmp(argv[i],"-m")==0) { + if((i + 1) == argc) { + fprintf(stderr,"%s: option requires an argument -- %s\n", argv[0], argv[i]); + _print_tryhelp(argv[0], EXIT_FAILURE); + } + option.maxcount = atoi(argv[++i]); + } else if(strcmp(argv[i],"-b")==0) { + option.flags |= SA_PRINT_OFFSET; + } else if(strcmp(argv[i],"-H")==0) { + option.flags |= SA_PRINT_FILENAME; + } else if(strcmp(argv[i],"-c")==0) { + option.func = _onlyprint_count; + } else if(strcmp(argv[i],"-S")==0) { + option.flags |= SA_SORT_LEXICOGRAPHICALORDER; + } else if(strcmp(argv[i],"-B")==0) { + if((i + 1) == argc) { + fprintf(stderr,"%s: option requires an argument -- %s\n", argv[0], argv[i]); + _print_tryhelp(argv[0], EXIT_FAILURE); + } + option.blen = atoi(argv[++i]); + } else if(strcmp(argv[i],"-A")==0) { + if((i + 1) == argc) { + fprintf(stderr,"%s: option requires an argument -- %s\n", argv[0], argv[i]); + _print_tryhelp(argv[0], EXIT_FAILURE); + } + option.alen = atoi(argv[++i]); + } else if(strcmp(argv[i],"-C")==0) { + if((i + 1) == argc) { + fprintf(stderr,"%s: option requires an argument -- %s\n", argv[0], argv[i]); + _print_tryhelp(argv[0], EXIT_FAILURE); + } + option.alen = option.blen = atoi(argv[++i]); + + } else if(strcmp(argv[i],"--hex")==0) { + option.flags |= SA_HEX_MODE; + + } else { + fprintf(stderr,"%s: invalid option -- %s\n", argv[0], argv[i]); + _print_tryhelp(argv[0], EXIT_FAILURE); + } + } + + if(i == argc) { return 0; } + + sauchar_t *P = (sauchar_t *)argv[i]; + saidx_t Psize = (saidx_t)strlen(argv[i]); + + if(option.flags & SA_HEX_MODE) { + sauchar_t *newP = malloc(Psize / 2 * sizeof(sauchar_t)); + saidx_t j, k; + unsigned char c, t; + if(newP == NULL) { + fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); + exit(EXIT_FAILURE); + } + for(j = 0, k = 0, c = 0; j < Psize; ++j) { + if((('0' <= P[j]) && (P[j] <= '9')) || (('a' <= P[j]) && (P[j] <= 'f'))) { + t = (('0' <= P[j]) && (P[j] <= '9')) ? P[j] - '0' : (P[j] - 'a' + 10); + c = (c << 4) | t; + if(k & 1) { newP[k / 2] = c; } + k += 1; + } + } + Psize = k / 2; + P = newP; + } + + for(i += 1; (i + 1) < argc; i += 2) { + FILE *fp; + sauchar_t *T; + saidx_t *SA; + saidx_t size; + +#if HAVE_SYS_STAT_H + struct stat s; + if(stat(argv[i], &s) != 0) { + fprintf(stderr, "%s: Cannot stat file `%s': ", argv[0], argv[i]); + perror(NULL); + exit(EXIT_FAILURE); + } + size = s.st_size; +#endif + + option.fname = argv[i]; + /* Open a file for reading. */ + if((fp = fopen(argv[i], "rb")) == NULL) { + fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[i]); + perror(NULL); + exit(EXIT_FAILURE); + } +#if !HAVE_SYS_STAT_H + if(fseek(fp, 0, SEEK_END) != 0) { + fprintf(stderr, "%s: Cannot fseek on `%s': ", argv[0], argv[i]); + perror(NULL); + exit(EXIT_FAILURE); + } + if((size = ftell(fp)) == -1) { + fprintf(stderr, "%s: Cannot ftell on `%s': ", argv[0], argv[i]); + perror(NULL); + exit(EXIT_FAILURE); + } + rewind(fp); +#endif + + /* Allocate n+4(n+1) bytes of memory. */ + if(((T = malloc(size * sizeof(sauchar_t))) == NULL) || + ((SA = malloc((size + 1) * sizeof(saidx_t))) == NULL)) { + fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); + exit(EXIT_FAILURE); + } + + /* Read n * sizeof(sauchar_t) bytes of data. */ + if(fread(T, sizeof(sauchar_t), size, fp) != size) { + fprintf(stderr, "%s: %s `%s': ", + argv[0], + (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", + argv[i]); + perror(NULL); + exit(EXIT_FAILURE); + } + fclose(fp); + + /* Open a SA file for reading. */ + if((fp = fopen(argv[i + 1], "rb")) == NULL) { + fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[i + 1]); + perror(NULL); + exit(EXIT_FAILURE); + } + /* Read (n + 1) * sizeof(saidx_t) bytes of data. */ + if(fread(SA, sizeof(saidx_t), size + 1, fp) != (size + 1)) { + fprintf(stderr, "%s: %s `%s': ", + argv[0], + (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", + argv[i + 1]); + perror(NULL); + exit(EXIT_FAILURE); + } + fclose(fp); + + _search_file(T, size, P, Psize, SA, size + 1, &option); + + free(T); + free(SA); + } + + if(option.flags & SA_HEX_MODE) { + free(P); + } + + return 0; +} diff --git a/examples/suftest.c b/examples/suftest.c new file mode 100644 index 0000000..9d9e2d4 --- /dev/null +++ b/examples/suftest.c @@ -0,0 +1,133 @@ +/* + * suftest.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <divsufsort.h> +#include <stdio.h> +#if HAVE_STDLIB_H +# include <stdlib.h> +#endif +#if HAVE_STRING_H +# if !STDC_HEADERS && HAVE_MEMORY_H +# include <memory.h> +# endif +# include <string.h> +#endif +#if HAVE_STRINGS_H +# include <strings.h> +#endif +#include <time.h> +#if HAVE_SYS_STAT_H +# include <sys/stat.h> +#endif + + +int +main(int argc, const char *argv[]) { + FILE *fp; + sauchar_t *T; + saidx_t *SA; + saidx_t n; + clock_t start, finish; +#if HAVE_SYS_STAT_H + struct stat s; +#endif + + /* Check argument. */ + if((argc != 2) || + (strcmp(argv[1], "-h") == 0) || + (strcmp(argv[1], "--help") == 0)) { + fprintf(stderr, + "suftest, a suffixsort tester, version %s.\n" + , divsufsort_version()); + fprintf(stderr, + "usage: %s FILE\n\n" + , argv[0]); + exit(EXIT_FAILURE); + } + + /* Get a file's status information. */ +#if HAVE_SYS_STAT_H + if(stat(argv[1], &s) != 0) { + fprintf(stderr, "%s: Cannot stat file `%s': ", argv[0], argv[1]); + perror(NULL); + exit(EXIT_FAILURE); + } + n = s.st_size; +#endif + + /* Open a file for reading. */ + if((fp = fopen(argv[1], "rb")) == NULL) { + fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], argv[1]); + perror(NULL); + exit(EXIT_FAILURE); + } + +#if !HAVE_SYS_STAT_H + fseek(fp, 0, SEEK_END); + n = ftell(fp); + rewind(fp); +#endif + + /* Allocate n+4(n+1) bytes of memory. */ + if(((T = malloc(n * sizeof(sauchar_t))) == NULL) || + ((SA = malloc((n + 1) * sizeof(saidx_t))) == NULL)) { + fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); + exit(EXIT_FAILURE); + } + + /* Read n bytes of data. */ + if(fread(T, sizeof(sauchar_t), n, fp) != n) { + fprintf(stderr, "%s: %s `%s': ", + argv[0], + (ferror(fp) || !feof(fp)) ? "Cannot read from" : "Unexpected EOF in", + argv[1]); + perror(NULL); + exit(EXIT_FAILURE); + } + fclose(fp); + + /* Construct the suffix array. */ + fprintf(stderr, "%s: %d bytes ... ", argv[1], (int)n); + start = clock(); + divsufsort(T, SA, n); + finish = clock(); + fprintf(stderr, "%.4f sec\n", + (double)(finish - start) / (double)CLOCKS_PER_SEC); + + /* Check the suffix array. */ + if(sufcheck(T, SA, n, 3) != 0) { + exit(EXIT_FAILURE); + } + + /* Deallocate memory. */ + free(SA); + free(T); + + return 0; +} diff --git a/examples/unbwt.c b/examples/unbwt.c new file mode 100644 index 0000000..e989053 --- /dev/null +++ b/examples/unbwt.c @@ -0,0 +1,121 @@ +/* + * unbwt.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <divsufsort.h> +#include <stdio.h> +#if HAVE_STDLIB_H +# include <stdlib.h> +#endif +#if HAVE_STRING_H +# if !STDC_HEADERS && HAVE_MEMORY_H +# include <memory.h> +# endif +# include <string.h> +#endif +#if HAVE_STRINGS_H +# include <strings.h> +#endif +#include <time.h> + + +int +main(int argc, const char *argv[]) { + sauchar_t *T; + saidx_t *A, m, n, blocksize, idx; + saint_t err; + clock_t start, finish; + + /* Check argument. */ + if(argc != 1) { + fprintf(stderr, + "unbwt, an inverse burrows-wheeler transform program, version %s.\n" + , divsufsort_version()); + fprintf(stderr, + "usage: %s < STDIN > STDOUT\n\n" + , argv[0]); + return 0; + } + + /* Read the blocksize from stdin. */ + if(fread(&blocksize, sizeof(saidx_t), 1, stdin) != 1) { + fprintf(stderr, "%s: %s `stdin': ", + argv[0], + (ferror(stdin) || !feof(stdin)) ? + "Cannot read from" : "Unexpected EOF in"); + perror(NULL); + exit(EXIT_FAILURE); + } + + /* Allocate blocksize+4(blocksize+1) bytes of memory. */ + if(((T = malloc(blocksize * sizeof(sauchar_t))) == NULL) || + ((A = malloc((blocksize + 1) * sizeof(saidx_t))) == NULL)) { + fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]); + exit(EXIT_FAILURE); + } + + fprintf(stderr, "UnBWT (blocksize %d) ... ", (int)blocksize); + start = clock(); + for(n = 0; fread(&idx,sizeof(saidx_t),1,stdin)!=0; n += m) { + /* Read blocksize bytes of data. */ + if((m = fread(T, sizeof(sauchar_t), blocksize, stdin)) == 0) { + fprintf(stderr, "%s: Unexpected EOF in `stdin': ", argv[0]); + perror(NULL); + exit(EXIT_FAILURE); + } + + /* Inverse Burrows-Wheeler Transform. */ + if((err = inverse_bw_transform(T, T, A, m, idx)) != 0) { + fprintf(stderr, "%s (inverse_bw_transform): %s.\n", + argv[0], + (err == -1) ? "Invalid arguments" : "Cannot allocate memory"); + exit(EXIT_FAILURE); + } + + /* Write m bytes of data. */ + if(fwrite(T, sizeof(sauchar_t), m, stdout) != m) { + fprintf(stderr, "%s: Cannot write to `stdout': ", argv[0]); + perror(NULL); + exit(EXIT_FAILURE); + } + } + if(ferror(stdin)) { + fprintf(stderr, "%s: Cannot read from `stdin': ", argv[0]); + perror(NULL); + exit(EXIT_FAILURE); + } + finish = clock(); + fprintf(stderr, "%d bytes: %.4f sec\n", + (int)n, (double)(finish - start) / (double)CLOCKS_PER_SEC); + + /* Deallocate memory. */ + free(T); + free(A); + + return 0; +} diff --git a/include/Makefile.am b/include/Makefile.am new file mode 100644 index 0000000..3bd3b45 --- /dev/null +++ b/include/Makefile.am @@ -0,0 +1,4 @@ +# Makefile.am for libdivsufsort + +nodist_include_HEADERS = divsufsort.h +EXTRA_DIST = divsufsort_private.h.in divsufsort.h.in diff --git a/include/divsufsort.h.in b/include/divsufsort.h.in new file mode 100644 index 0000000..742880c --- /dev/null +++ b/include/divsufsort.h.in @@ -0,0 +1,132 @@ +/* + * divsufsort.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT_H_ +#define _DIVSUFSORT_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#if @_HAVE_INTTYPES_H_@ /* _HAVE_INTTYPES_H_ */ +# include <inttypes.h> +#else +# if @_HAVE_STDINT_H_@ /* _HAVE_STDINT_H_ */ +# include <stdint.h> +# else + +#ifndef _UINT8_T +#define _UINT8_T +typedef unsigned char uint8_t; +#endif /* _UINT8_T */ +#ifndef _INT32_T +#define _INT32_T +typedef int int32_t; +#endif /* _INT32_T */ + +# endif +#endif + +/*- Datatypes -*/ +typedef int32_t saint_t; +typedef int32_t saidx_t; +typedef uint8_t sauchar_t; + + +/*- Prototypes -*/ + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +saint_t +divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n); + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n] The temporary array. (can be NULL) + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ +saidx_t +divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n); + +/** + * Returns the version of the divsufsort library. + * @return The version number string. + */ +const char * +divsufsort_version(void); + + +/* Burrows-Wheeler transform. */ +saint_t +bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *SA /* can NULL */, + saidx_t n, saidx_t *idx); + +/* Inverse Burrows-Wheeler transform. */ +saint_t +inverse_bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *A /* can NULL */, + saidx_t n, saidx_t idx); + +/** + * Checks the correctness of a given suffix array. + * @param T[0..n-1] The input string. + * @param SA[0..n] The input suffix array. + * @param n The length of the given string. + * @param verbose The verbose mode. + * @return 0 if no error occurred. + */ +saint_t +sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose); + + +/* Search for the pattern P in the string T. */ +saidx_t +sa_search(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t *left); + +/* Search for the character c in the string T. */ +saidx_t +sa_simplesearch(const sauchar_t *T, saidx_t Tsize, + const saidx_t *SA, saidx_t SAsize, + saint_t c, saidx_t *left); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT_H_ */ diff --git a/include/divsufsort_private.h.in b/include/divsufsort_private.h.in new file mode 100644 index 0000000..f0d5e02 --- /dev/null +++ b/include/divsufsort_private.h.in @@ -0,0 +1,162 @@ +/* + * divsufsort_private.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT_PRIVATE_H_ +#define _DIVSUFSORT_PRIVATE_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include <assert.h> +#include <stdio.h> +#if STDC_HEADERS +# include <stdlib.h> +# include <stddef.h> +#else +# if HAVE_STDLIB_H +# include <stdlib.h> +# endif +#endif +#if HAVE_STRING_H +# if !STDC_HEADERS && HAVE_MEMORY_H +# include <memory.h> +# endif +# include <string.h> +#endif +#if HAVE_STRINGS_H +# include <strings.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#else +# if HAVE_STDINT_H +# include <stdint.h> +# endif +#endif +#include "divsufsort.h" + + +/*- Constants -*/ +#if !defined(UINT8_MAX) +# define UINT8_MAX (255) +#endif /* UINT8_MAX */ +#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) +# undef ALPHABET_SIZE +#endif +#if !defined(ALPHABET_SIZE) +# define ALPHABET_SIZE (UINT8_MAX + 1) +#endif +#if defined(STACK_SIZE) && (STACK_SIZE < 32) +# undef STACK_SIZE +#endif +#if !defined(STACK_SIZE) +# define STACK_SIZE (64) +#endif +#if defined(LOCALMERGE_BUFFERSIZE) && (LOCALMERGE_BUFFERSIZE < 32) +# undef LOCALMERGE_BUFFERSIZE +#endif +#if !defined(LOCALMERGE_BUFFERSIZE) +# define LOCALMERGE_BUFFERSIZE (256) +#endif +/* for divsufsort.c */ +#define BUCKET_A_SIZE (ALPHABET_SIZE) +#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) +/* for sssort.c */ +#define SS_INSERTIONSORT_THRESHOLD (8) +#define SS_BLOCKSIZE (1024) +/* for trsort.c */ +#define LS_INSERTIONSORT_THRESHOLD (8) +#define TR_INSERTIONSORT_THRESHOLD (8) + + +/*- Macros -*/ +#ifndef SWAP +# define SWAP(a,b) do { t = (a); (a) = (b); (b) = t; } while(0) +#endif /* SWAP */ +#ifndef MIN +# define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif /* MIN */ +#ifndef MAX +# define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#endif /* MAX */ +#define STACK_PUSH3(_a, _b, _c)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize++].c = (_c);\ + } while(0) +#define STACK_PUSH(_a, _b, _c, _d)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize++].d = (_d);\ + } while(0) +#define STACK_POP3(_a, _b, _c)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c;\ + } while(0) +#define STACK_POP(_a, _b, _c, _d)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ + } while(0) +/* for divsufsort.c */ +#define BUCKET_A(c0) bucket_A[(c0)] +#if ALPHABET_SIZE == 256 +#define BUCKET_B(c0, c1) (bucket_B[((c1) << 8) | (c0)]) +#define BUCKET_BSTAR(c0, c1) (bucket_B[((c0) << 8) | (c1)]) +#else +#define BUCKET_B(c0, c1) (bucket_B[(c1) * ALPHABET_SIZE + (c0)]) +#define BUCKET_BSTAR(c0, c1) (bucket_B[(c0) * ALPHABET_SIZE + (c1)]) +#endif + + +/*- Private Prototypes -*/ +/* substringsort.c */ +void +substringsort(const sauchar_t *Td, const saidx_t *PA, + saidx_t *first, saidx_t *last, + saidx_t *buf, saidx_t bufsize, + saidx_t depth, saint_t lastsuffix); +/* trsort.c */ +void +trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT_PRIVATE_H_ */ diff --git a/lib/Makefile.am b/lib/Makefile.am new file mode 100644 index 0000000..cf34c43 --- /dev/null +++ b/lib/Makefile.am @@ -0,0 +1,9 @@ +# Makefile.am for libdivsufsort + +lib_LTLIBRARIES = libdivsufsort.la +libdivsufsort_la_SOURCES = divsufsort.c substringsort.c trsort.c utils.c +libdivsufsort_la_LDFLAGS = -version-info @LT_CURRENT@:@LT_REVISION@:@LT_AGE@ @LT_NOUNDEF@ -export-symbols libdivsufsort.sym + +AM_CFLAGS = @A_CFLAGS@ + +EXTRA_DIST = libdivsufsort.sym diff --git a/lib/divsufsort.c b/lib/divsufsort.c new file mode 100644 index 0000000..8615246 --- /dev/null +++ b/lib/divsufsort.c @@ -0,0 +1,362 @@ +/* + * divsufsort.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Functions -*/ + +/* Sorts suffixes of type B*. */ +static +saidx_t +_sort_typeBstar(const sauchar_t *T, saidx_t *SA, + saidx_t *bucket_A, saidx_t *bucket_B, + saidx_t n) { + saidx_t *PAb, *ISAb, *buf; + saidx_t i, j, k, t, m, bufsize; + saint_t c0, c1; + + /* Initialize bucket arrays. */ + for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } + for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } + + /* Count the number of occurrences of the first one or two characters of each + type A, B and B* suffix. Moreover, store the beginning position of all + type B* suffixes into the array SA. */ + for(i = n - 1, m = n; 0 <= i;) { + /* type A suffix. */ + do { ++BUCKET_A(T[i]); } while((0 <= --i) && (T[i] >= T[i + 1])); + if(0 <= i) { + /* type B* suffix. */ + ++BUCKET_BSTAR(T[i], T[i + 1]); + SA[--m] = i; + /* type B suffix. */ + for(--i; (0 <= i) && (T[i] <= T[i + 1]); --i) { + ++BUCKET_B(T[i], T[i + 1]); + } + } + } + m = n - m; +/* +note: + A type B* suffix is lexicographically smaller than a type B suffix that + begins with the same first two characters. +*/ + + /* Calculate the index of start/end point of each bucket. */ + for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { + t = i + BUCKET_A(c0); + BUCKET_A(c0) = i + j; /* start point */ + i = t + BUCKET_B(c0, c0); + for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { + j += BUCKET_BSTAR(c0, c1); + BUCKET_BSTAR(c0, c1) = j; /* end point */ + i += BUCKET_B(c0, c1); + } + } + + if(0 < m) { + /* Sort the type B* suffixes by their first two characters. */ + PAb = SA + n - m; ISAb = SA + m; + PAb[m] = n - 2; /* for sentinel. */ + for(i = m - 2; 0 <= i; --i) { + t = PAb[i], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = i; + } + t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = m - 1; + + /* Sort the type B* substrings using sssort. */ + buf = SA + m, bufsize = n - (2 * m); + if(bufsize <= LOCALMERGE_BUFFERSIZE) { + if((buf = malloc(LOCALMERGE_BUFFERSIZE * sizeof(saidx_t))) == NULL) { + return -1; + } + bufsize = LOCALMERGE_BUFFERSIZE; + } + for(c0 = ALPHABET_SIZE - 1, j = m; 0 < j; --c0) { + for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { + i = BUCKET_BSTAR(c0, c1); + if(1 < (j - i)) { + substringsort(T, PAb, SA + i, SA + j, + buf, bufsize, 2, *(SA + i) == (m - 1)); + } + } + } + if(bufsize == LOCALMERGE_BUFFERSIZE) { free(buf); } + + /* Compute ranks of type B* substrings. */ + for(i = m - 1; 0 <= i; --i) { + if(0 <= SA[i]) { + j = i; + do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); + SA[i + 1] = i - j; + if(i <= 0) { break; } + } + j = i; + do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); + ISAb[SA[i]] = j; + } + + /* Construct the inverse suffix array of type B* suffixes using trsort. */ + trsort(ISAb, SA, m, 1); + + /* Set the sorted order of tyoe B* suffixes. */ + for(i = n - 1, j = m; 0 <= i;) { + for(--i; (0 <= i) && (T[i] >= T[i + 1]); --i) { } + if(0 <= i) { + SA[ISAb[--j]] = i; + for(--i; (0 <= i) && (T[i] <= T[i + 1]); --i) { } + } + } + + /* Calculate the index of start/end point of each bucket. */ + for(c0 = ALPHABET_SIZE - 1, i = n, k = m - 1; 0 <= c0; --c0) { + for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { + t = i - BUCKET_B(c0, c1); + BUCKET_B(c0, c1) = i + 1; /* end point */ + + /* Move all type B* suffixes to the correct position. */ + for(i = t, j = BUCKET_BSTAR(c0, c1); + j <= k; + --i, --k) { SA[i] = SA[k]; } + } + t = i - BUCKET_B(c0, c0); + BUCKET_B(c0, c0) = i + 1; /* end point */ + if(c0 < (ALPHABET_SIZE - 1)) { + BUCKET_BSTAR(c0, c0 + 1) = t + 1; /* start point */ + } + i = BUCKET_A(c0); + } + } + + return m; +} + +/* Constructs the suffix array by using the sorted order of type B* suffixes. */ +static +void +_construct_SA(const sauchar_t *T, saidx_t *SA, + saidx_t *bucket_A, saidx_t *bucket_B, + saidx_t n, saidx_t m) { + saidx_t *i, *j, *t; + saidx_t s; + saint_t c0, c1, c2; + + /** An implementation version of MSufSort3's second stage. **/ + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1), t = NULL, c2 = -1; + i <= j; + --j) { + if(0 <= (s = *j)) { + if((0 <= --s) && ((c0 = T[s]) <= c1)) { + *j = ~(s + 1); + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c2 == c0) { *--t = s; } + else { + if(0 <= c2) { BUCKET_B(c2, c1) = t - SA; } + *(t = SA + BUCKET_B(c2 = c0, c1) - 1) = s; + } + } + } else { + *j = ~s; + } + } + } + } + + /* Construct the suffix array by using + the sorted order of type B suffixes. */ + SA[0] = n; + *(t = SA + BUCKET_A(c2 = T[n - 1]) + 1) = n - 1; + /* Scan the suffix array from left to right. */ + for(i = SA + 1, j = SA + n; i <= j; ++i) { + if(0 <= (s = *i)) { + if((0 <= --s) && ((c0 = T[s]) >= T[s + 1])) { + if((0 < s) && (T[s - 1] < c0)) { s = ~s; } + if(c0 == c2) { *++t = s; } + else { + BUCKET_A(c2) = t - SA; + *(t = SA + BUCKET_A(c2 = c0) + 1) = s; + } + } + } else { + *i = ~s; + } + } +} + +/* Constructs the burrows-wheeler transformed string directly + by using the sorted order of type B* suffixes. */ +static +saidx_t +_construct_BWT(const sauchar_t *T, saidx_t *SA, + saidx_t *bucket_A, saidx_t *bucket_B, + saidx_t n, saidx_t m) { + saidx_t *i, *j, *t, *orig; + saidx_t s; + saint_t c0, c1, c2; + + /** An implementation version of MSufSort3's semidirect BWT. **/ + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1), t = NULL, c2 = -1; + i <= j; + --j) { + if(0 <= (s = *j)) { + if((0 <= --s) && ((c0 = T[s]) <= c1)) { + *j = ~((saidx_t)c0); + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 == c2) { *--t = s; } + else { + if(0 <= c2) { BUCKET_B(c2, c1) = t - SA; } + *(t = SA + BUCKET_B(c2 = c0, c1) - 1) = s; + } + } + } else { + *j = ~s; + } + } + } + } + + /* Construct the BWTed string by using + the sorted order of type B suffixes. */ + c0 = T[s = n - 1]; + SA[0] = c0; + if(T[s - 1] < c0) { s = ~((saidx_t)T[s - 1]); } + *(t = SA + BUCKET_A(c2 = c0) + 1) = s; + /* Scan the suffix array from left to right. */ + for(i = SA + 1, j = SA + n, orig = SA - 1; i <= j; ++i) { + if(0 <= (s = *i)) { + if((0 <= --s) && ((c0 = T[s]) >= T[s + 1])) { + *i = c0; + if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); } + if(c0 == c2) { *++t = s; } + else { + BUCKET_A(c2) = t - SA; + *(t = SA + BUCKET_A(c2 = c0) + 1) = s; + } + } else if(s < 0) { orig = i; } + } else { + *i = ~s; + } + } + + return orig - SA; +} + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +saint_t +divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) { + saidx_t *bucket_A, *bucket_B; + saidx_t m; + saint_t err = 0; + + /* Check arguments. */ + if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } + else if(n == 0) { SA[0] = 0; return 0; } + else if(n == 1) { SA[0] = 1; SA[1] = 0; return 0; } + + bucket_A = malloc(BUCKET_A_SIZE * sizeof(saidx_t)); + bucket_B = malloc(BUCKET_B_SIZE * sizeof(saidx_t)); + + /* Suffixsort. */ + if((bucket_A != NULL) && (bucket_B != NULL)) { + m = _sort_typeBstar(T, SA, bucket_A, bucket_B, n); + if(0 <= m) { + _construct_SA(T, SA, bucket_A, bucket_B, n, m); + } else { + err = -3; + } + } else { + err = -2; + } + + free(bucket_B); + free(bucket_A); + + return err; +} + +saidx_t +divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) { + saidx_t *B; + saidx_t *bucket_A, *bucket_B; + saidx_t m, pidx = -1, i; + saint_t err = 0; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } + else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } + + if((B = A) == NULL) { B = malloc((n + 1) * sizeof(saidx_t)); } + bucket_A = malloc(BUCKET_A_SIZE * sizeof(saidx_t)); + bucket_B = malloc(BUCKET_B_SIZE * sizeof(saidx_t)); + + /* Burrows-Wheeler Transform. */ + if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { + m = _sort_typeBstar(T, B, bucket_A, bucket_B, n); + if(0 <= m) { + pidx = _construct_BWT(T, B, bucket_A, bucket_B, n, m); + + /* Copy to output string. */ + for(i = 0; i < pidx; ++i) { U[i] = B[i]; } + for(; i < n; ++i) { U[i] = B[i + 1]; } + } else { + err = -3; + } + } else { + err = -2; + } + + free(bucket_B); + free(bucket_A); + if(A == NULL) { free(B); } + + return pidx; +} + + +const char * +divsufsort_version(void) { + return PACKAGE_VERSION; +} diff --git a/lib/libdivsufsort.sym b/lib/libdivsufsort.sym new file mode 100644 index 0000000..2dcda95 --- /dev/null +++ b/lib/libdivsufsort.sym @@ -0,0 +1,8 @@ +divsufsort +divbwt +divsufsort_version +bw_transform +inverse_bw_transform +sufcheck +sa_search +sa_simplesearch diff --git a/lib/substringsort.c b/lib/substringsort.c new file mode 100644 index 0000000..e628f94 --- /dev/null +++ b/lib/substringsort.c @@ -0,0 +1,619 @@ +/* + * substringsort.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Functions -*/ + +/* Compares two suffixes. */ +static inline +saint_t +_compare(const sauchar_t *T, + const saidx_t *p1, const saidx_t *p2, + saidx_t depth) { + const sauchar_t *U1, *U2, *U1n, *U2n; + + for(U1 = T + depth + *p1, + U2 = T + depth + *p2, + U1n = T + *(p1 + 1) + 2, + U2n = T + *(p2 + 1) + 2; + (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); + ++U1, ++U2) { + } + + return U1 < U1n ? + (U2 < U2n ? *U1 - *U2 : 1) : + (U2 < U2n ? -1 : 0); +} + + +/*---------------------------------------------------------------------------*/ + +/* Insertionsort for small size groups */ +static +void +_insertionsort(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *last, saidx_t depth) { + saidx_t *i, *j; + saidx_t t; + saint_t r; + + for(i = last - 2; first <= i; --i) { + for(t = *i, j = i + 1; 0 < (r = _compare(T, PA + t, PA + *j, depth));) { + do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); + if(last <= j) { break; } + } + if(r == 0) { *j = ~*j; } + *(j - 1) = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +static inline +void +_fixdown(const sauchar_t *Td, const saidx_t *PA, + saidx_t *SA, saidx_t i, saidx_t size) { + saidx_t j, k; + saidx_t v; + saint_t c, d, e; + + for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = Td[PA[SA[k = j++]]]; + if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +_heapsort(const sauchar_t *Td, const saidx_t *PA, saidx_t *SA, saidx_t size) { + saidx_t i, m; + saidx_t t; + + m = size; + if((size % 2) == 0) { + m--; + if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { _fixdown(Td, PA, SA, i, m); } + + if((size % 2) == 0) { + SWAP(SA[0], SA[m]); + _fixdown(Td, PA, SA, 0, m); + } + + for(i = m - 1; 0 < i; --i) { + t = SA[0]; + SA[0] = SA[i]; + _fixdown(Td, PA, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static inline +saidx_t * +_median3(const sauchar_t *Td, const saidx_t *PA, + saidx_t *v1, saidx_t *v2, saidx_t *v3) { + saidx_t *t; + if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } + if(Td[PA[*v2]] > Td[PA[*v3]]) { + if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static inline +saidx_t * +_median5(const sauchar_t *Td, const saidx_t *PA, + saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) { + saidx_t *t; + if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } + if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } + if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } + if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } + if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } + if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static inline +saidx_t * +_pivot(const sauchar_t *Td, const saidx_t *PA, saidx_t *first, saidx_t *last) { + saidx_t *middle; + saidx_t t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return _median3(Td, PA, first, middle, last - 1); + } else { + t >>= 2; + return _median5(Td, PA, + first, first + t, middle, + last - 1 - t, last - 1); + } + } + t >>= 3; + return _median3(Td, PA, + _median3(Td, PA, first, first + t, first + (t << 1)), + _median3(Td, PA, middle - t, middle, middle + t), + _median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1)); +} + + +/*---------------------------------------------------------------------------*/ + +static inline +saidx_t +_lg(saidx_t n) { +static const int log2table[256]= { + -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + /* for 16 bits */ + return ((n & 0xff00) != 0) ? + 8 + log2table[(n >> 8) & 0xff] : + log2table[n & 0xff]; + + /* for 32 bits */ +/* + return (n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + log2table[(n >> 24) & 0xff] : + 16 + log2table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + log2table[(n >> 8) & 0xff] : + 0 + log2table[(n >> 0) & 0xff]); +*/ +} + +/* Binary partition for substrings. */ +static inline +saidx_t * +_substring_partition(const saidx_t *PA, + saidx_t *first, saidx_t *last, saidx_t depth) { + saidx_t *a, *b; + saidx_t t; + for(a = first - 1, b = last;;) { + for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } + for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } + if(b <= a) { break; } + t = ~*b; + *b = *a; + *a = t; + } + if(first < a) { *first = ~*first; } + return a; +} + +/* Multikey introsort for medium size groups. */ +static +void +_multikey_introsort(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *last, + saidx_t depth) { + struct { saidx_t *a, *b, c, d; } stack[STACK_SIZE]; + const sauchar_t *Td; + saidx_t *a, *b, *c, *d, *e, *f; + saidx_t s, t; + saidx_t ssize; + saidx_t limit; + saint_t v, x = 0; + + for(ssize = 0, limit = _lg(last - first);;) { + + if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { + if(1 < (last - first)) { _insertionsort(T, PA, first, last, depth); } + STACK_POP(first, last, depth, limit); + continue; + } + + Td = T + depth; + if(limit-- == 0) { _heapsort(Td, PA, first, last - first); } + if(limit < 0) { + for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { + if((x = Td[PA[*a]]) != v) { + if(1 < (a - first)) { break; } + v = x; + first = a; + } + } + if(Td[PA[*first] - 1] < v) { + first = _substring_partition(PA, first, a, depth); + } + if((a - first) <= (last - a)) { + if(1 < (a - first)) { + STACK_PUSH(a, last, depth, -1); + last = a, depth += 1, limit = _lg(a - first); + } else { + first = a, limit = -1; + } + } else { + if(1 < (last - a)) { + STACK_PUSH(first, a, depth + 1, _lg(a - first)); + first = a, limit = -1; + } else { + last = a, depth += 1, limit = _lg(a - first); + } + } + continue; + } + + /* choose pivot */ + a = _pivot(Td, PA, first, last); + v = Td[PA[*a]]; + SWAP(*first, *a); + + /* partition */ + for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + + a = first + (b - a), c = last - (d - c); + b = (v <= Td[PA[*a] - 1]) ? a : _substring_partition(PA, a, c, depth); + + if((a - first) <= (last - c)) { + if((last - c) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, _lg(c - b)); + STACK_PUSH(c, last, depth, limit); + last = a; + } else if((a - first) <= (c - b)) { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(b, c, depth + 1, _lg(c - b)); + last = a; + } else { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(first, a, depth, limit); + first = b, last = c, depth += 1, limit = _lg(c - b); + } + } else { + if((a - first) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, _lg(c - b)); + STACK_PUSH(first, a, depth, limit); + first = c; + } else if((last - c) <= (c - b)) { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(b, c, depth + 1, _lg(c - b)); + first = c; + } else { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(c, last, depth, limit); + first = b, last = c, depth += 1, limit = _lg(c - b); + } + } + } else { + limit += 1; + if(Td[PA[*first] - 1] < v) { + first = _substring_partition(PA, first, last, depth); + limit = _lg(last - first); + } + depth += 1; + } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Block swapping */ +static inline +void +_block_swap(saidx_t *first1, saidx_t *first2, saidx_t size) { + saidx_t *a, *b; + saidx_t i, t; + for(i = size, a = first1, b = first2; 0 < i; --i, ++a, ++b) { + SWAP(*a, *b); + } +} + +/* Merge-forward with internal buffer. */ +static +void +_merge_forward(const sauchar_t *T, const saidx_t *PA, + saidx_t *buf, saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t depth) { + saidx_t *bufend; + saidx_t *i, *j, *k; + saidx_t t; + saint_t r; + + bufend = buf + (middle - first) - 1; + _block_swap(buf, first, middle - first); + + for(t = *first, i = first, j = buf, k = middle;;) { + r = _compare(T, PA + *j, PA + *k, depth); + if(r < 0) { + do { + *i++ = *j; + if(bufend <= j) { *j = t; return; } + *j++ = *i; + } while(*j < 0); + } else if(r > 0) { + do { + *i++ = *k, *k++ = *i; + if(last <= k) { + while(j < bufend) { *i++ = *j, *j++ = *i; } + *i = *j, *j = t; + return; + } + } while(*k < 0); + } else { + *k = ~*k; + do { + *i++ = *j; + if(bufend <= j) { *j = t; return; } + *j++ = *i; + } while(*j < 0); + + do { + *i++ = *k, *k++ = *i; + if(last <= k) { + while(j < bufend) { *i++ = *j, *j++ = *i; } + *i = *j, *j = t; + return; + } + } while(*k < 0); + } + } +} + +/* Merge-backward with internal buffer. */ +static +void +_merge_backward(const sauchar_t *T, const saidx_t *PA, saidx_t *buf, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t depth) { + const saidx_t *p1, *p2; + saidx_t *bufend; + saidx_t *i, *j, *k; + saidx_t t; + saint_t r; + saint_t x; + + bufend = buf + (last - middle); + _block_swap(buf, middle, last - middle); + + x = 0; + if(*(bufend - 1) < 0) { x |= 1; p1 = PA + ~*(bufend - 1); } + else { p1 = PA + *(bufend - 1); } + if(*(middle - 1) < 0) { x |= 2; p2 = PA + ~*(middle - 1); } + else { p2 = PA + *(middle - 1); } + for(t = *(last - 1), i = last - 1, j = bufend - 1, k = middle - 1;;) { + + r = _compare(T, p1, p2, depth); + if(r > 0) { + if(x & 1) { do { *i-- = *j; *j-- = *i; } while(*j < 0); x ^= 1; } + *i-- = *j; + if(j <= buf) { *j = t; return; } + *j-- = *i; + + if(*j < 0) { x |= 1; p1 = PA + ~*j; } + else { p1 = PA + *j; } + } else if(r < 0) { + if(x & 2) { do { *i-- = *k; *k-- = *i; } while(*k < 0); x ^= 2; } + *i-- = *k; *k-- = *i; + if(k < first) { + while(buf < j) { *i-- = *j; *j-- = *i; } + *i = *j, *j = t; + return; + } + + if(*k < 0) { x |= 2; p2 = PA + ~*k; } + else { p2 = PA + *k; } + } else { + if(x & 1) { do { *i-- = *j; *j-- = *i; } while(*j < 0); x ^= 1; } + *i-- = ~*j; + if(j <= buf) { *j = t; return; } + *j-- = *i; + + if(x & 2) { do { *i-- = *k; *k-- = *i; } while(*k < 0); x ^= 2; } + *i-- = *k; *k-- = *i; + if(k < first) { + while(buf < j) { *i-- = *j; *j-- = *i; } + *i = *j, *j = t; + return; + } + + if(*j < 0) { x |= 1; p1 = PA + ~*j; } + else { p1 = PA + *j; } + if(*k < 0) { x |= 2; p2 = PA + ~*k; } + else { p2 = PA + *k; } + } + } +} + +/* Faster merge (based on divide and conquer technique). */ +static +void +_merge(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t *buf, saidx_t bufsize, + saidx_t depth) { +#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a))) +#define MERGE_CHECK_EQUAL(a)\ + do {\ + if((0 <= *(a)) &&\ + (_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0)) {\ + *(a) = ~*(a);\ + }\ + } while(0) + struct { saidx_t *a, *b, *c; int d; } stack[STACK_SIZE]; + saidx_t *i, *j; + saidx_t m, len, half; + saidx_t ssize; + int check, next; + + for(check = 0, ssize = 0;;) { + + if((last - middle) <= bufsize) { + if((first < middle) && (middle < last)) { + _merge_backward(T, PA, buf, first, middle, last, depth); + } + if(check & 1) { MERGE_CHECK_EQUAL(first); } + if(check & 2) { MERGE_CHECK_EQUAL(last); } + STACK_POP(first, middle, last, check); + continue; + } + + if((middle - first) <= bufsize) { + if(first < middle) { + _merge_forward(T, PA, buf, first, middle, last, depth); + } + if(check & 1) { MERGE_CHECK_EQUAL(first); } + if(check & 2) { MERGE_CHECK_EQUAL(last); } + STACK_POP(first, middle, last, check); + continue; + } + + for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; + 0 < len; + len = half, half >>= 1) { + if(_compare(T, PA + GETIDX(*(middle + m + half)), + PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { + m += half + 1; + half -= (len & 1) ^ 1; + } + } + + if(0 < m) { + _block_swap(middle - m, middle, m); + i = j = middle, next = 0; + if((middle + m) < last) { + if(*(middle + m) < 0) { + for(; *(i - 1) < 0; --i) { } + *(middle + m) = ~*(middle + m); + } + for(j = middle; *j < 0; ++j) { } + next = 1; + } + if((i - first) <= (last - j)) { + STACK_PUSH(j, middle + m, last, (check & 2) | (next & 1)); + middle -= m, last = i, check = (check & 1); + } else { + if((i == middle) && (middle == j)) { next <<= 1; } + STACK_PUSH(first, middle - m, i, (check & 1) | (next & 2)); + first = j, middle += m, check = (check & 2) | (next & 1); + } + } else { + if(check & 1) { MERGE_CHECK_EQUAL(first); } + MERGE_CHECK_EQUAL(middle); + if(check & 2) { MERGE_CHECK_EQUAL(last); } + STACK_POP(first, middle, last, check); + } + } +} + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +/* Substring sort */ +void +substringsort(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *last, + saidx_t *buf, saidx_t bufsize, + saidx_t depth, saint_t lastsuffix) { + saidx_t *a, *b; + saidx_t *curbuf; + saidx_t i, j, k; + saidx_t curbufsize; + + if(lastsuffix != 0) { ++first; } + for(a = first, i = 0; (a + SS_BLOCKSIZE) < last; a += SS_BLOCKSIZE, ++i) { + _multikey_introsort(T, PA, a, a + SS_BLOCKSIZE, depth); + curbuf = a + SS_BLOCKSIZE; + curbufsize = last - (a + SS_BLOCKSIZE); + if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } + for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) { + _merge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); + } + } + _multikey_introsort(T, PA, a, last, depth); + for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { + if(i & 1) { + _merge(T, PA, a - k, a, last, buf, bufsize, depth); + a -= k; + } + } + + if(lastsuffix != 0) { + /* Insert last type B* suffix. */ + for(a = first, i = *(first - 1); + (a < last) && ((*a < 0) || (0 < _compare(T, PA + i, PA + *a, depth))); + ++a) { + *(a - 1) = *a; + } + *(a - 1) = i; + } +} diff --git a/lib/trsort.c b/lib/trsort.c new file mode 100644 index 0000000..9e5670e --- /dev/null +++ b/lib/trsort.c @@ -0,0 +1,684 @@ +/* + * trsort.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Functions -*/ + +static inline +void +_fixdown(const saidx_t *ISAd, saidx_t *SA, saidx_t i, saidx_t size) { + saidx_t j, k; + saidx_t v; + saidx_t c, d, e; + + for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = ISAd[SA[k = j++]]; + if(d < (e = ISAd[SA[j]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +_heapsort(const saidx_t *ISAd, saidx_t *SA, saidx_t size) { + saidx_t i, m; + saidx_t t; + + m = size; + if((size % 2) == 0) { + m--; + if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { + SWAP(SA[m], SA[m / 2]); + } + } + + for(i = m / 2 - 1; 0 <= i; --i) { + _fixdown(ISAd, SA, i, m); + } + + if((size % 2) == 0) { + SWAP(SA[0], SA[m]); + _fixdown(ISAd, SA, 0, m); + } + + for(i = m - 1; 0 < i; --i) { + t = SA[0]; + SA[0] = SA[i]; + _fixdown(ISAd, SA, 0, i); + SA[i] = t; + } +} + +/* Simple insertionsort for small size groups. */ +static +void +_insertionsort(const saidx_t *ISAd, saidx_t *first, saidx_t *last) { + saidx_t *a, *b; + saidx_t t, r; + + for(a = first + 1; a < last; ++a) { + for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { + do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); + if(b < first) { break; } + } + if(r == 0) { *b = ~*b; } + *(b + 1) = t; + } +} + +static inline +saidx_t +_lg(saidx_t n) { +static const int log2table[256]= { + -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + /* for 16 bits */ +/* + return ((n & 0xff00) != 0) ? + 8 + log2table[(n >> 8) & 0xff] : + log2table[n & 0xff]; +*/ + /* for 32 bits */ + return (n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + log2table[(n >> 24) & 0xff] : + 16 + log2table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + log2table[(n >> 8) & 0xff] : + 0 + log2table[(n >> 0) & 0xff]); +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static inline +saidx_t * +_median3(const saidx_t *ISAd, saidx_t *v1, saidx_t *v2, saidx_t *v3) { + saidx_t *t; + if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } + if(ISAd[*v2] > ISAd[*v3]) { + if(ISAd[*v1] > ISAd[*v3]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static inline +saidx_t * +_median5(const saidx_t *ISAd, + saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) { + saidx_t *t; + if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } + if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } + if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } + if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } + if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } + if(ISAd[*v3] > ISAd[*v4]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static inline +saidx_t * +_pivot(const saidx_t *ISAd, saidx_t *first, saidx_t *last) { + saidx_t *middle; + saidx_t t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return _median3(ISAd, first, middle, last - 1); + } else { + t >>= 2; + return _median5(ISAd, + first, first + t, + middle, + last - 1 - t, last - 1); + } + } + t >>= 3; + return _median3(ISAd, + _median3(ISAd, first, first + t, first + (t << 1)), + _median3(ISAd, middle - t, middle, middle + t), + _median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1)); +} + + +/*---------------------------------------------------------------------------*/ + +/* Update group numbers. */ +static +void +_ls_updategroup(saidx_t *ISA, const saidx_t *SA, + saidx_t *first, saidx_t *last) { + saidx_t *a, *b; + saidx_t t; + + for(a = first; a < last; ++a) { + if(0 <= *a) { + b = a; + do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); + *b = b - a; + if(last <= a) { break; } + } + b = a; + do { *a = ~*a; } while(*++a < 0); + t = a - SA; + do { ISA[*b] = t; } while(++b <= a); + } +} + +/* Introspective sort. */ +static +void +_ls_introsort(saidx_t *ISA, saidx_t *ISAd, const saidx_t *SA, + saidx_t *first, saidx_t *last) { + struct { saidx_t *a, *b, c; } stack[STACK_SIZE]; + saidx_t *a, *b, *c, *d, *e, *f; + saidx_t s, t; + saidx_t limit; + saidx_t v, x = 0; + int ssize; + + for(ssize = 0, limit = _lg(last - first);;) { + + if((last - first) <= LS_INSERTIONSORT_THRESHOLD) { + if(1 < (last - first)) { + _insertionsort(ISAd, first, last); + _ls_updategroup(ISA, SA, first, last); + } else if((last - first) == 1) { *first = -1; } + STACK_POP3(first, last, limit); + continue; + } + + if(limit-- == 0) { + _heapsort(ISAd, first, last - first); + for(a = last - 1; first < a; a = b) { + for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } + } + _ls_updategroup(ISA, SA, first, last); + STACK_POP3(first, last, limit); + continue; + } + + /* choose pivot */ + a = _pivot(ISAd, first, last); + SWAP(*first, *a); + v = ISAd[*first]; + + /* Two-stage double-index controlled ternary partition */ + for(b = first; (++b < last) && ((x = ISAd[*b]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + + a = first + (b - a), b = last - (d - c); + + /* update ranks */ + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } + if((b - a) == 1) { *a = - 1; } + + if((a - first) <= (last - b)) { + if(first < a) { + STACK_PUSH3(b, last, limit); + last = a; + } else { + first = b; + } + } else { + if(b < last) { + STACK_PUSH3(first, a, limit); + first = b; + } else { + last = a; + } + } + } else { + STACK_POP3(first, last, limit); + } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Larsson-Sadakane sort */ +static +void +_lssort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth) { + saidx_t *ISAd; + saidx_t *first, *last; + saidx_t t, skip; + + for(ISAd = ISA + depth; -n < *SA; ISAd += (ISAd - ISA)) { + first = SA; + skip = 0; + do { + if((t = *first) < 0) { first -= t; skip += t; } + else { + if(skip != 0) { *(first + skip) = skip; skip = 0; } + last = SA + ISA[t] + 1; + _ls_introsort(ISA, ISAd, SA, first, last); + first = last; + } + } while(first < (SA + n)); + if(skip != 0) { *(first + skip) = skip; } + } +} + + +/*---------------------------------------------------------------------------*/ + +static +void +_tr_partition(const saidx_t *ISAd, const saidx_t *SA, + saidx_t *first, saidx_t *last, + saidx_t **pa, saidx_t **pb, saidx_t v) { + saidx_t *a, *b, *c, *d, *e, *f; + saidx_t t, s; + saidx_t x = 0; + + for(b = first - 1; (++b < last) && ((x = ISAd[*b]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + first += (b - a), last -= (d - c); + } + *pa = first, *pb = last; +} + +static +void +_tr_copy(saidx_t *ISA, const saidx_t *SA, + saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last, + saidx_t depth) { + /* sort suffixes of middle partition + by using sorted order of suffixes of left and right partition. */ + saidx_t *c, *d, *e; + saidx_t s, v; + + v = b - SA - 1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + ISA[s] = d - SA; + } + } + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + ISA[s] = d - SA; + } + } +} + +/* Multikey introsort. */ +static +void +_tr_introsort(saidx_t *ISA, const saidx_t *ISAd, + saidx_t *SA, saidx_t *first, saidx_t *last, + saidx_t *budget, saidx_t *chance, saidx_t size) { +#define UPDATE_BUDGET(n)\ + {\ + (*budget) -= (n);\ + if(*budget <= 0) {\ + if(--(*chance) == 0) { break; }\ + (*budget) += size;\ + }\ + } + struct { const saidx_t *a; saidx_t *b, *c, d; }stack[STACK_SIZE]; + saidx_t *a, *b, *c, *d, *e, *f; + saidx_t s, t; + saidx_t v, x = 0; + saidx_t limit, next; + int ssize; + + for(ssize = 0, limit = _lg(last - first);;) { + + if(limit < 0) { + if(limit == -1) { + /* tandem repeat partition */ + UPDATE_BUDGET(last - first); + _tr_partition(ISAd - 1, SA, first, last, &a, &b, last - SA - 1); + + /* update ranks */ + if(a < last) { + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + } + if(b < last) { + for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } + } + + /* push */ + STACK_PUSH(NULL, a, b, 0); + STACK_PUSH(ISAd - 1, first, last, -2); + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH(ISAd, b, last, _lg(last - b)); + last = a, limit = _lg(a - first); + } else if(1 < (last - b)) { + first = b, limit = _lg(last - b); + } else { + STACK_POP(ISAd, first, last, limit); + } + } else { + if(1 < (last - b)) { + STACK_PUSH(ISAd, first, a, _lg(a - first)); + first = b, limit = _lg(last - b); + } else if(1 < (a - first)) { + last = a, limit = _lg(a - first); + } else { + STACK_POP(ISAd, first, last, limit); + } + } + } else if(limit == -2) { + /* tandem repeat copy */ + a = stack[--ssize].b, b = stack[ssize].c; + _tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); + STACK_POP(ISAd, first, last, limit); + } else { + /* sorted partition */ + if(0 <= *first) { + a = first; + do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); + first = a; + } + if(first < last) { + a = first; do { *a = ~*a; } while(*++a < 0); + next = (ISA[*a] != ISAd[*a]) ? _lg(a - first + 1) : -1; + if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } + + /* push */ + if((a - first) <= (last - a)) { + STACK_PUSH(ISAd, a, last, -3); + ISAd += 1, last = a, limit = next; + } else { + if(1 < (last - a)) { + STACK_PUSH(ISAd + 1, first, a, next); + first = a, limit = -3; + } else { + ISAd += 1, last = a, limit = next; + } + } + } else { + STACK_POP(ISAd, first, last, limit); + } + } + continue; + } + + if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { + UPDATE_BUDGET(last - first); +// UPDATE_BUDGET((last - first) * (last - first) / 2); + _insertionsort(ISAd, first, last); + limit = -3; + continue; + } + + if(limit-- == 0) { + UPDATE_BUDGET(last - first); +// UPDATE_BUDGET((last - first) * _lg(last - first)); + _heapsort(ISAd, first, last - first); + for(a = last - 1; first < a; a = b) { + for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } + } + limit = -3; + continue; + } + + UPDATE_BUDGET(last - first); + + /* choose pivot */ + a = _pivot(ISAd, first, last); + SWAP(*first, *a); + v = ISAd[*first]; + + /* partition */ + for(b = first; (++b < last) && ((x = ISAd[*b]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + + a = first + (b - a), b = last - (d - c); + next = (ISA[*a] != v) ? _lg(b - a) : -1; + + /* update ranks */ + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } + + /* push */ + if((a - first) <= (last - b)) { + if((last - b) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH(ISAd + 1, a, b, next); + STACK_PUSH(ISAd, b, last, limit); + last = a; + } else if(1 < (last - b)) { + STACK_PUSH(ISAd + 1, a, b, next); + first = b; + } else if(1 < (b - a)) { + ISAd += 1, first = a, last = b, limit = next; + } else { + STACK_POP(ISAd, first, last, limit); + } + } else if((a - first) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH(ISAd, b, last, limit); + STACK_PUSH(ISAd + 1, a, b, next); + last = a; + } else if(1 < (b - a)) { + STACK_PUSH(ISAd, b, last, limit); + ISAd += 1, first = a, last = b, limit = next; + } else { + first = b; + } + } else { + if(1 < (b - a)) { + STACK_PUSH(ISAd, b, last, limit); + STACK_PUSH(ISAd, first, a, limit); + ISAd += 1, first = a, last = b, limit = next; + } else { + STACK_PUSH(ISAd, b, last, limit); + last = a; + } + } + } else { + if((a - first) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH(ISAd + 1, a, b, next); + STACK_PUSH(ISAd, first, a, limit); + first = b; + } else if(1 < (a - first)) { + STACK_PUSH(ISAd + 1, a, b, next); + last = a; + } else if(1 < (b - a)) { + ISAd += 1, first = a, last = b, limit = next; + } else { + STACK_POP(ISAd, first, last, limit); + } + } else if((last - b) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH(ISAd, first, a, limit); + STACK_PUSH(ISAd + 1, a, b, next); + first = b; + } else if(1 < (b - a)) { + STACK_PUSH(ISAd, first, a, limit); + ISAd += 1, first = a, last = b, limit = next; + } else { + last = a; + } + } else { + if(1 < (b - a)) { + STACK_PUSH(ISAd, first, a, limit); + STACK_PUSH(ISAd, b, last, limit); + ISAd += 1, first = a, last = b, limit = next; + } else { + STACK_PUSH(ISAd, first, a, limit); + first = b; + } + } + } + } else { + limit += 1, ISAd += 1; + } + } + + for(s = 0; s < ssize; ++s) { + if(stack[s].d == -3) { + _ls_updategroup(ISA, SA, stack[s].b, stack[s].c); + } + } +} + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +/* Tandem repeat sort */ +void +trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth) { + saidx_t *first, *last; + saidx_t t, skip; + saidx_t budget; + saidx_t chance; + + if(-n < *SA) { + first = SA; + skip = 0; + budget = n; +/* chance = _lg(n); */ +/* chance = _lg(n) / 2 + 1; */ + chance = _lg(n) * 2 / 3 + 1; + do { + if((t = *first) < 0) { first -= t; skip += t; } + else { + skip = 0; + last = SA + ISA[t] + 1; + if(1 < (last - first)) { + _tr_introsort(ISA, ISA + depth, SA, first, last, &budget, &chance, n); + if(chance == 0) { + /* Switch to Larsson-Sadakane sorting algorithm. */ + if(SA < first) { *SA = -(first - SA); } + _lssort(ISA, SA, n, depth); + break; + } + } + first = last; + } + } while(first < (SA + n)); + } +} diff --git a/lib/utils.c b/lib/utils.c new file mode 100644 index 0000000..086b1ca --- /dev/null +++ b/lib/utils.c @@ -0,0 +1,378 @@ +/* + * utils.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Function -*/ + +/* Binary search for inverse bwt. */ +static +saidx_t +_binarysearch(const saidx_t *A, saidx_t len, saidx_t val) { + saidx_t half, m; + for(m = 0, half = len >> 1; + 0 < len; + len = half, half >>= 1) { + if(A[m + half] < val) { + m += half + 1; + half -= ((len & 1) == 0); + } + } + return m; +} + + +/*- Functions -*/ + +/* Burrows-Wheeler transform. */ +saint_t +bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA, + saidx_t n, saidx_t *idx) { + saidx_t *A, i, p; + saint_t c; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; } + if(n <= 1) { + *idx = n; + return 0; + } + + if((A = SA) == NULL) { + i = divbwt(T, U, NULL, n); + if(0 <= i) { *idx = i; i = 0; } + return (saint_t)i; + } + + /* BW transform. */ + if(T == U) { + for(i = 0; 0 <= (p = A[i] - 1); ++i) { + c = T[i]; + U[i] = (i <= p) ? T[p] : A[p]; + A[i] = c; + } + *idx = i; + for( ; i < n; ++i) { + p = A[i + 1] - 1; + c = T[i]; + U[i] = (i <= p) ? T[p] : A[p]; + A[i] = c; + } + } else { + for(i = 0; A[i] != 0; ++i) { U[i] = T[A[i] - 1]; } + *idx = i; + for( ; i < n; ++i) { U[i] = T[A[i + 1] - 1]; } + } + + if(SA == NULL) { + /* Deallocate memory. */ + free(A); + } + + return 0; +} + +/* Inverse Burrows-Wheeler transform. */ +saint_t +inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A, + saidx_t n, saidx_t idx) { + saidx_t C[ALPHABET_SIZE]; + saidx_t D[ALPHABET_SIZE]; + saidx_t *B; + saidx_t i, k, t, sum; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) || + (n < idx) || ((0 < n) && (idx == 0))) { + return -1; + } + if(n <= 1) { return 0; } + + if((B = A) == NULL) { + /* Allocate (n+1)*sizeof(saidx_t) bytes of memory. */ + if((B = malloc((n + 1) * sizeof(saidx_t))) == NULL) { return -2; } + } + + /* Inverse BW transform. */ + for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; } + for(i = 0; i < n; ++i) { ++C[T[i]]; } + for(i = 0, sum = 0; i < ALPHABET_SIZE; ++i) { + t = C[i]; + C[i] = sum; + sum += t; + } + B[0] = idx; + for(i = 0; i < idx; ++i) { B[++C[T[i]]] = i; } + for( ; i < n; ++i) { B[++C[T[i]]] = i + 1; } + for(i = 0, k = 0, t = 0; i < ALPHABET_SIZE; ++i) { + if(t != C[i]) { + D[k] = i; + t = C[k] = C[i]; + ++k; + } + } + for(i = 0, t = 0; i < n; ++i) { + U[i] = D[_binarysearch(C, k, t = B[t])]; + } + + if(A == NULL) { + /* Deallocate memory. */ + free(B); + } + + return 0; +} + +/* Checks the suffix array SA of the string T. */ +saint_t +sufcheck(const sauchar_t *T, const saidx_t *SA, + saidx_t n, saint_t verbose) { + saidx_t C[ALPHABET_SIZE]; + saidx_t i = 0, p, t = 0; + saint_t c; + saint_t err = 0; + + if(1 <= verbose) { fprintf(stderr, "sufchecker: "); } + + /* Check arguments. */ + if((T == NULL) || (SA == NULL) || (n < 0)) { err = -1; } + + /* ranges. */ + if(err == 0) { + for(i = 0; i <= n; ++i) { + if((SA[i] < 0) || (n < SA[i])) { + err = -2; + break; + } + } + } + + /* first characters. */ + if(err == 0) { + for(i = 1; i < n; ++i) { + if(T[SA[i]] > T[SA[i + 1]]) { + err = -3; + break; + } + } + } + + /* suffixes. */ + if(err == 0) { + for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; } + for(i = 0; i < n; ++i) { ++C[T[i]]; } + for(i = 0, p = 1; i < ALPHABET_SIZE; ++i) { + t = C[i]; + C[i] = p; + p += t; + } + + for(i = 0; i <= n; ++i) { + p = SA[i]; + if(0 < p) { + c = T[--p]; + t = C[c]; + } else { + p = n; + c = -1; + t = 0; + } + if(p != SA[t]) { + err = -4; + break; + } + if(0 <= c) { + ++C[c]; + if((n < C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; } + } + } + } + + if(1 <= verbose) { + if(err == 0) { + fprintf(stderr, "Done.\n"); + } else if(verbose == 1) { + fprintf(stderr, "Error.\n"); + } else if(err == -1) { + fprintf(stderr, "Invalid arguments.\n"); + } else if(err == -2) { + fprintf(stderr, "Out of the range [0,%d].\n SA[%d]=%d\n", + (int)n, (int)i, (int)SA[i]); + } else if(err == -3) { + fprintf(stderr, "Suffixes in wrong order.\n" + " T[SA[%d]=%d]=%d > T[SA[%d]=%d]=%d\n", + (int)i, (int)SA[i], (int)T[SA[i]], + (int)i + 1, (int)SA[i + 1], (int)T[SA[i + 1]]); + } else if(err == -4) { + fprintf(stderr, "Suffix in wrong position.\n"); + if(0 <= t) { fprintf(stderr, " SA[%d]=%d or\n", (int)t, (int)SA[t]); } + fprintf(stderr, " SA[%d]=%d\n", (int)i, (int)SA[i]); + } + } + + return err; +} + + +static +int +_compare(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + saidx_t suf, saidx_t *match) { + saidx_t i, j; + saint_t r; + for(i = suf + *match, j = *match, r = 0; + (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { } + *match = j; + return (r == 0) ? -(j != Psize) : r; +} + +/* Search for the pattern P in the string T. */ +saidx_t +sa_search(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t *idx) { + saidx_t size, lsize, rsize, half; + saidx_t match, lmatch, rmatch; + saidx_t llmatch, lrmatch, rlmatch, rrmatch; + saidx_t i, j, k; + saint_t r; + + if(idx != NULL) { *idx = -1; } + if((T == NULL) || (P == NULL) || (SA == NULL) || + (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; } + if((Tsize == 0) || (SAsize == 0)) { return 0; } + if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; } + + for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1; + 0 < size; + size = half, half >>= 1) { + match = MIN(lmatch, rmatch); + r = _compare(T, Tsize, P, Psize, SA[i + half], &match); + if(r < 0) { + i += half + 1; + half -= (size & 1) ^ 1; + lmatch = match; + } else if(r > 0) { + rmatch = match; + } else { + lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; + + /* left part */ + for(llmatch = lmatch, lrmatch = match, half = lsize >> 1; + 0 < lsize; + lsize = half, half >>= 1) { + lmatch = MIN(llmatch, lrmatch); + r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch); + if(r < 0) { + j += half + 1; + half -= (lsize & 1) ^ 1; + llmatch = lmatch; + } else { + lrmatch = lmatch; + } + } + + /* right part */ + for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1; + 0 < rsize; + rsize = half, half >>= 1) { + rmatch = MIN(rlmatch, rrmatch); + r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch); + if(r <= 0) { + k += half + 1; + half -= (rsize & 1) ^ 1; + rlmatch = rmatch; + } else { + rrmatch = rmatch; + } + } + + break; + } + } + + if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } + return k - j; +} + +/* Search for the character c in the string T. */ +saidx_t +sa_simplesearch(const sauchar_t *T, saidx_t Tsize, + const saidx_t *SA, saidx_t SAsize, + saint_t c, saidx_t *idx) { + saidx_t size, lsize, rsize, half; + saidx_t i, j, k, p; + saint_t r; + + if(idx != NULL) { *idx = -1; } + if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; } + if((Tsize == 0) || (SAsize == 0)) { return 0; } + + for(i = j = k = 0, size = SAsize, half = size >> 1; + 0 < size; + size = half, half >>= 1) { + p = SA[i + half]; + r = (p < Tsize) ? T[p] - c : -1; + if(r < 0) { + i += half + 1; + half -= (size & 1) ^ 1; + } else if(r == 0) { + lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; + + /* left part */ + for(half = lsize >> 1; + 0 < lsize; + lsize = half, half >>= 1) { + p = SA[j + half]; + r = (p < Tsize) ? T[p] - c : -1; + if(r < 0) { + j += half + 1; + half -= (lsize & 1) ^ 1; + } + } + + /* right part */ + for(half = rsize >> 1; + 0 < rsize; + rsize = half, half >>= 1) { + p = SA[k + half]; + r = (p < Tsize) ? T[p] - c : -1; + if(r <= 0) { + k += half + 1; + half -= (rsize & 1) ^ 1; + } + } + + break; + } + } + + if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } + return k - j; +} |