// Copyright 2016 Google Inc. All rights reserved // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // http://ci.nii.ac.jp/naid/110006613087 #include "lcp_msort.h" #include #include "strutil.h" #define DO_SORT_AND_UNIQ_AT_ONCE namespace { struct AnnotatedString { const unsigned char* s; int l; }; inline int LcpCompare(const unsigned char* s1, const unsigned char* s2, int b, int& i) { for (i = b; s1[i] || s2[i]; i++) { unsigned char c1 = s1[i]; unsigned char c2 = s2[i]; if (c1 < c2) { return -1; } else if (c1 > c2) { return 1; } } return 0; } #ifdef DO_SORT_AND_UNIQ_AT_ONCE int StringMergeSortAndUniq(const vector& data, int begin, int end, AnnotatedString* work, AnnotatedString* out) { if (begin + 1 == end) { out[begin] = AnnotatedString{ data[begin], 0 }; return 1; } int mid = (begin + end) / 2; int s1_len = StringMergeSortAndUniq(data, begin, mid, work, out); int s2_len = StringMergeSortAndUniq(data, mid, end, work, out); memcpy(work, out + begin, s1_len * sizeof(AnnotatedString)); AnnotatedString* s1 = work; AnnotatedString* s2 = out + mid; int i = 0, j = 0, k = 0; AnnotatedString* d = out + begin; for (; i < s1_len && j < s2_len; k++) { if (s1[i].l > s2[j].l) { d[k] = s1[i]; i++; } else if (s1[i].l < s2[j].l) { d[k] = s2[j]; j++; } else { int m; int r = LcpCompare(s1[i].s, s2[j].s, s1[i].l, m); if (r < 0) { d[k] = s1[i]; i++; s2[j].l = m; } else if (r > 0) { d[k] = s2[j]; j++; s1[i].l = m; } else { // Discard a unique string. d[k] = s1[i]; i++; j++; } } } if (i < s1_len) { memcpy(&d[k], &s1[i], (s1_len - i) * sizeof(AnnotatedString)); k += s1_len - i; } else if (j < s2_len) { memcpy(&d[k], &s2[j], (s2_len - j) * sizeof(AnnotatedString)); k += s2_len - j; } return k; } #else void StringMergeSort(const vector& data, int begin, int end, AnnotatedString* work, AnnotatedString* out) { if (begin + 1 == end) { out[begin] = AnnotatedString{ data[begin], 0 }; return; } int mid = (begin + end) / 2; StringMergeSort(data, begin, mid, work, out); StringMergeSort(data, mid, end, work, out); memcpy(work, out + begin, (mid - begin) * sizeof(AnnotatedString)); AnnotatedString* s1 = work; int s1_len = mid - begin; AnnotatedString* s2 = out + mid; int s2_len = end - mid; int i = 0, j = 0, k = 0; AnnotatedString* d = out + begin; for (; i < s1_len && j < s2_len; k++) { if (s1[i].l > s2[j].l) { d[k] = s1[i]; i++; } else if (s1[i].l < s2[j].l) { d[k] = s2[j]; j++; } else { int m; int r = LcpCompare(s1[i].s, s2[j].s, s1[i].l, m); if (r < 0) { d[k] = s1[i]; i++; s2[j].l = m; } else { d[k] = s2[j]; j++; s1[i].l = m; } } } if (i < s1_len) memcpy(&d[k], &s1[i], (s1_len - i) * sizeof(AnnotatedString)); else if (j < s2_len) memcpy(&d[k], &s2[j], (s2_len - j) * sizeof(AnnotatedString)); } #endif } // namespace void StringSortByLcpMsort(string* buf, string* out) { vector toks; for (StringPiece tok : WordScanner(*buf)) { const_cast(tok.data())[tok.size()] = 0; toks.push_back(reinterpret_cast(tok.data())); } if (toks.empty()) return; if (toks.size() == 1) { *out += reinterpret_cast(toks[0]); return; } AnnotatedString* as = static_cast( malloc(toks.size() * sizeof(AnnotatedString))); AnnotatedString* work = static_cast( malloc((toks.size() / 2 + 1) * sizeof(AnnotatedString))); #ifdef DO_SORT_AND_UNIQ_AT_ONCE int len = StringMergeSortAndUniq(toks, 0, toks.size(), work, as); #else int len = static_cast(toks.size()); StringMergeSort(toks, 0, toks.size(), work, as); #endif WordWriter ww(out); #ifdef DO_SORT_AND_UNIQ_AT_ONCE for (int i = 0; i < len; i++) { ww.MaybeAddWhitespace(); *out += reinterpret_cast(as[i].s); } #else StringPiece prev; for (int i = 0; i < len; i++) { StringPiece tok = reinterpret_cast(as[i].s); if (prev != tok) { ww.Write(tok); prev = tok; } } #endif free(as); free(work); }