aboutsummaryrefslogtreecommitdiffstats
path: root/tools/java/org/unicode/cldr/util/StringRange.java
blob: c846f7354b79b446d1f94df82dee9b5761f67726 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
package org.unicode.cldr.util;

import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.util.ICUException;

@SuppressWarnings("deprecation")
public class StringRange {
    private static final boolean DEBUG = false;

    public interface Adder {
        /**
         * @param start
         * @param end   may be null, for adding single string
         */
        void add(String start, String end);
    }

    public static final Comparator<int[]> COMPARE_INT_ARRAYS = new Comparator<int[]>() {
        @Override
        public int compare(int[] o1, int[] o2) {
            int minIndex = Math.min(o1.length, o2.length);
            for (int i = 0; i < minIndex; ++i) {
                int diff = o1[i] - o2[i];
                if (diff != 0) {
                    return diff;
                }
            }
            return o1.length - o2.length;
        }
    };

    /**
     * Compact the set of strings.
     * @param source
     * @param adder adds each pair to the output. See the {@link Adder} interface.
     * @param shorterPairs use abc-d instead of abc-abd
     * @param moreCompact use a more compact form, at the expense of more processing. If false, source must be sorted.
     */
    public static void compact(Set<String> source, Adder adder, boolean shorterPairs, boolean moreCompact) {
        if (!moreCompact) {
            String start = null;
            String end = null;
            int lastCp = 0;
            int prefixLen = 0;
            for (String s : source) {
                if (start != null) { // We have something queued up
                    if (s.regionMatches(0, start, 0, prefixLen)) {
                        int currentCp = s.codePointAt(prefixLen);
                        if (currentCp == 1 + lastCp && s.length() == prefixLen + Character.charCount(currentCp)) {
                            end = s;
                            lastCp = currentCp;
                            continue;
                        }
                    }
                    // We failed to find continuation. Add what we have and restart
                    adder.add(start, end == null ? null
                        : !shorterPairs ? end
                            : end.substring(prefixLen, end.length()));
                }
                // new possible range
                start = s;
                end = null;
                lastCp = s.codePointBefore(s.length());
                prefixLen = s.length() - Character.charCount(lastCp);
            }
            adder.add(start, end == null ? null
                : !shorterPairs ? end
                    : end.substring(prefixLen, end.length()));
        } else {
            // not a fast algorithm, but ok for now
            // TODO rewire to use the first (slower) algorithm to generate the ranges, then compact them from there.
            // first sort by lengths
            Relation<Integer, Ranges> lengthToArrays = Relation.of(new TreeMap<Integer, Set<Ranges>>(), TreeSet.class);
            for (String s : source) {
                Ranges item = new Ranges(s);
                lengthToArrays.put(item.size(), item);
            }
            // then compact items of each length and emit compacted sets
            for (Entry<Integer, Set<Ranges>> entry : lengthToArrays.keyValuesSet()) {
                LinkedList<Ranges> compacted = compact(entry.getKey(), entry.getValue());
                for (Ranges ranges : compacted) {
                    adder.add(ranges.start(), ranges.end(shorterPairs));
                }
            }
        }
    }

    /**
     * Faster but not as good compaction. Only looks at final codepoint.
     * @param source
     * @param adder
     * @param shorterPairs
     */
    public static void compact(Set<String> source, Adder adder, boolean shorterPairs) {
        compact(source, adder, shorterPairs, false);
    }

    private static LinkedList<Ranges> compact(int size, Set<Ranges> inputRanges) {
        LinkedList<Ranges> ranges = new LinkedList<>(inputRanges);
        for (int i = size - 1; i >= 0; --i) {
            Ranges last = null;
            for (Iterator<Ranges> it = ranges.iterator(); it.hasNext();) {
                Ranges item = it.next();
                if (last == null) {
                    last = item;
                } else if (last.merge(i, item)) {
                    it.remove();
                } else {
                    last = item; // go to next
                }
            }
        }
        ;
        return ranges;
    }

    static final class Range implements Comparable<Range> {
        int min;
        int max;

        public Range(int min, int max) {
            this.min = min;
            this.max = max;
        }

        @Override
        public boolean equals(Object obj) {
            return compareTo((Range) obj) == 0;
        }

        @Override
        public int compareTo(Range that) {
            int diff = min - that.min;
            if (diff != 0) {
                return diff;
            }
            return max - that.max;
        }

        @Override
        public int hashCode() {
            return min * 37 + max;
        }

        @Override
        public String toString() {
            StringBuilder result = new StringBuilder().appendCodePoint(min);
            return min == max ? result.toString() : result.append('~').appendCodePoint(max).toString();
        }
    }

    static final class Ranges implements Comparable<Ranges> {
        private final Range[] ranges;

        public Ranges(String s) {
            int[] array = CharSequences.codePoints(s);
            ranges = new Range[array.length];
            for (int i = 0; i < array.length; ++i) {
                ranges[i] = new Range(array[i], array[i]);
            }
        }

        public boolean merge(int pivot, Ranges other) {
//            if (this.toString().equals("afz")) {
//                int debug = 0;
//            }
            // we will merge items if the pivot is adjacent, and all other ranges are equal
            for (int i = ranges.length - 1; i >= 0; --i) {
                if (i == pivot) {
                    if (ranges[i].max != other.ranges[i].min - 1) { // not adjacent
                        return false;
                    }
                } else {
                    if (!ranges[i].equals(other.ranges[i])) {
                        return false;
                    }
                }
            }
            if (DEBUG) System.out.print("Merging: " + this + ", " + other);
            ranges[pivot].max = other.ranges[pivot].max;
            if (DEBUG) System.out.println(" => " + this);
            return true;
        }

        public String start() {
            StringBuilder result = new StringBuilder();
            for (int i = 0; i < ranges.length; ++i) {
                result.appendCodePoint(ranges[i].min);
            }
            return result.toString();
        }

        public String end(boolean mostCompact) {
            int firstDiff = firstDifference();
            if (firstDiff == ranges.length) {
                return null;
            }
            StringBuilder result = new StringBuilder();
            for (int i = mostCompact ? firstDiff : 0; i < ranges.length; ++i) {
                result.appendCodePoint(ranges[i].max);
            }
            return result.toString();
        }

        public int firstDifference() {
            for (int i = 0; i < ranges.length; ++i) {
                if (ranges[i].min != ranges[i].max) {
                    return i;
                }
            }
            return ranges.length;
        }

        public Integer size() {
            return ranges.length;
        }

        @Override
        public int compareTo(Ranges other) {
            int diff = ranges.length - other.ranges.length;
            if (diff != 0) {
                return diff;
            }
            for (int i = 0; i < ranges.length; ++i) {
                diff = ranges[i].compareTo(other.ranges[i]);
                if (diff != 0) {
                    return diff;
                }
            }
            return 0;
        }

        @Override
        public String toString() {
            String start = start();
            String end = end(false);
            return end == null ? start : start + "~" + end;
        }
    }

    static Collection<String> expand(String start, String end, Collection<String> output) {
        int[] startCps = CharSequences.codePoints(start);
        int[] endCps = CharSequences.codePoints(end);
        int startOffset = startCps.length - endCps.length;

        if (startOffset < 0) {
            throw new ICUException("Must have start-length ≥ end-length");
        } else if (endCps.length == 0) {
            throw new ICUException("Must have end-length > 0");
        }

        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < startOffset; ++i) {
            builder.appendCodePoint(startCps[i]);
        }
        add(0, startOffset, startCps, endCps, builder, output);
        return output;
    }

    private static void add(int endIndex, int startOffset, int[] starts, int[] ends, StringBuilder builder, Collection<String> output) {
        int start = starts[endIndex + startOffset];
        int end = ends[endIndex];
        if (start > end) {
            throw new ICUException("Each two corresponding characters ...x... and ...y... must have x ≤ y.");
        }
        boolean last = endIndex == ends.length - 1;
        int startLen = builder.length();
        for (int i = start; i <= end; ++i) {
            builder.appendCodePoint(i);
            if (last) {
                output.add(builder.toString());
            } else {
                add(endIndex + 1, startOffset, starts, ends, builder, output);
            }
            builder.setLength(startLen);
        }
    }
}