v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
liveedit-diff.cc
Go to the documentation of this file.
1// Copyright 2022 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
7#include <cmath>
8#include <map>
9#include <optional>
10#include <vector>
11
12#include "src/base/logging.h"
13
14namespace v8 {
15namespace internal {
16
17namespace {
18
19// Implements Myer's Algorithm from
20// "An O(ND) Difference Algorithm and Its Variations", particularly the
21// linear space refinement mentioned in section 4b.
22//
23// The differ is input agnostic.
24//
25// The algorithm works by finding the shortest edit string (SES) in the edit
26// graph. The SES describes how to get from a string A of length N to a string
27// B of length M via deleting from A and inserting from B.
28//
29// Example: A = "abbaa", B = "abab"
30//
31// A
32//
33// a b b a a
34// o---o---o---o---o---o
35// a | \ | | | \ | \ |
36// o---o---o---o---o---o
37// b | | \ | \ | | |
38// B o---o---o---o---o---o
39// a | \ | | | \ | \ |
40// o---o---o---o---o---o
41// b | | \ | \ | | |
42// o---o---o---o---o---o
43//
44// The edit graph is constructed with the characters from string A on the x-axis
45// and the characters from string B on the y-axis. Starting from (0, 0) we can:
46//
47// - Move right, which is equivalent to deleting from A
48// - Move downwards, which is equivalent to inserting from B
49// - Move diagonally if the characters from string A and B match, which
50// means no insertion or deletion.
51//
52// Any path from (0, 0) to (N, M) describes a valid edit string, but we try to
53// find the path with the most diagonals, conversely that is the path with the
54// least insertions or deletions.
55// Note that a path with "D" insertions/deletions is called a D-path.
56class MyersDiffer {
57 private:
58 // A point in the edit graph.
59 struct Point {
60 int x, y;
61
62 // Less-than for a point in the edit graph is defined as less than in both
63 // components (i.e. at least one diagonal away).
64 bool operator<(const Point& other) const {
65 return x < other.x && y < other.y;
66 }
67 };
68
69 // Describes a rectangle in the edit graph.
70 struct EditGraphArea {
72
73 int width() const { return bottom_right.x - top_left.x; }
74 int height() const { return bottom_right.y - top_left.y; }
75 int size() const { return width() + height(); }
76 int delta() const { return width() - height(); }
77 };
78
79 // A path or path-segment through the edit graph. Not all points along
80 // the path are necessarily listed since it is trivial to figure out all
81 // the concrete points along a snake.
82 struct Path {
83 std::vector<Point> points;
84
85 void Add(const Point& p) { points.push_back(p); }
86 void Add(const Path& p) {
87 points.insert(points.end(), p.points.begin(), p.points.end());
88 }
89 };
90
91 // A snake is a path between two points that is either:
92 //
93 // - A single right or down move followed by a (possibly empty) list of
94 // diagonals (in the normal case).
95 // - A (possibly empty) list of diagonals followed by a single right or
96 // or down move (in the reverse case).
97 struct Snake {
98 Point from, to;
99 };
100
101 // A thin wrapper around std::vector<int> that allows negative indexing.
102 //
103 // This class stores the x-value of the furthest reaching path
104 // for each k-diagonal. k-diagonals are numbered from -M to N and defined
105 // by y(x) = x - k.
106 //
107 // We only store the x-value instead of the full point since we can
108 // calculate y via y = x - k.
109 class FurthestReaching {
110 public:
111 explicit FurthestReaching(std::vector<int>::size_type size) : v_(size) {}
112
113 int& operator[](int index) {
114 const size_t idx = index >= 0 ? index : v_.size() + index;
115 return v_[idx];
116 }
117
118 const int& operator[](int index) const {
119 const size_t idx = index >= 0 ? index : v_.size() + index;
120 return v_[idx];
121 }
122
123 private:
124 std::vector<int> v_;
125 };
126
127 class ResultWriter;
128
129 Comparator::Input* input_;
130 Comparator::Output* output_;
131
132 // Stores the x-value of the furthest reaching path for each k-diagonal.
133 // k-diagonals are numbered from '-height' to 'width', centered on (0,0) and
134 // are defined by y(x) = x - k.
135 FurthestReaching fr_forward_;
136
137 // Stores the x-value of the furthest reaching reverse path for each
138 // l-diagonal. l-diagonals are numbered from '-width' to 'height' and centered
139 // on 'bottom_right' of the edit graph area.
140 // k-diagonals and l-diagonals represent the same diagonals. While we refer to
141 // the diagonals as k-diagonals when calculating SES from (0,0), we refer to
142 // the diagonals as l-diagonals when calculating SES from (M,N).
143 // The corresponding k-diagonal name of an l-diagonal is: k = l + delta
144 // where delta = width -height.
145 FurthestReaching fr_reverse_;
146
147 MyersDiffer(Comparator::Input* input, Comparator::Output* output)
148 : input_(input),
149 output_(output),
150 fr_forward_(input->GetLength1() + input->GetLength2() + 1),
151 fr_reverse_(input->GetLength1() + input->GetLength2() + 1) {
152 // Length1 + Length2 + 1 is the upper bound for our work arrays.
153 // We allocate the work arrays once and reuse them for all invocations of
154 // `FindMiddleSnake`.
155 }
156
157 std::optional<Path> FindEditPath() {
158 return FindEditPath(Point{0, 0},
159 Point{input_->GetLength1(), input_->GetLength2()});
160 }
161
162 // Returns the path of the SES between `from` and `to`.
163 std::optional<Path> FindEditPath(Point from, Point to) {
164 // Divide the area described by `from` and `to` by finding the
165 // middle snake ...
166 std::optional<Snake> snake = FindMiddleSnake(from, to);
167
168 if (!snake) return std::nullopt;
169
170 // ... and then conquer the two resulting sub-areas.
171 std::optional<Path> head = FindEditPath(from, snake->from);
172 std::optional<Path> tail = FindEditPath(snake->to, to);
173
174 // Combine `head` and `tail` or use the snake start/end points for
175 // zero-size areas.
176 Path result;
177 if (head) {
178 result.Add(*head);
179 } else {
180 result.Add(snake->from);
181 }
182
183 if (tail) {
184 result.Add(*tail);
185 } else {
186 result.Add(snake->to);
187 }
188 return result;
189 }
190
191 // Returns the snake in the middle of the area described by `from` and `to`.
192 //
193 // Incrementally calculates the D-paths (starting from 'from') and the
194 // "reverse" D-paths (starting from 'to') until we find a "normal" and a
195 // "reverse" path that overlap. That is we first calculate the normal
196 // and reverse 0-path, then the normal and reverse 1-path and so on.
197 //
198 // If a step from a (d-1)-path to a d-path overlaps with a reverse path on
199 // the same diagonal (or the other way around), then we consider that step
200 // our middle snake and return it immediately.
201 std::optional<Snake> FindMiddleSnake(Point from, Point to) {
202 EditGraphArea area{from, to};
203 if (area.size() == 0) return std::nullopt;
204
205 // Initialise the furthest reaching vectors with an "artificial" edge
206 // from (0, -1) -> (0, 0) and (N, -M) -> (N, M) to serve as the initial
207 // snake when d = 0.
208 fr_forward_[1] = area.top_left.x;
209 fr_reverse_[-1] = area.bottom_right.x;
210
211 for (int d = 0; d <= std::ceil(area.size() / 2.0f); ++d) {
212 if (auto snake = ShortestEditForward(area, d)) return snake;
213 if (auto snake = ShortestEditReverse(area, d)) return snake;
214 }
215
216 return std::nullopt;
217 }
218
219 // Greedily calculates the furthest reaching `d`-paths for each k-diagonal
220 // where k is in [-d, d]. For each k-diagonal we look at the furthest
221 // reaching `d-1`-path on the `k-1` and `k+1` depending on which is further
222 // along the x-axis we either add an insertion from the `k+1`-diagonal or
223 // a deletion from the `k-1`-diagonal. Then we follow all possible diagonal
224 // moves and finally record the result as the furthest reaching path on the
225 // k-diagonal.
226 std::optional<Snake> ShortestEditForward(const EditGraphArea& area, int d) {
227 Point from, to;
228 // We alternate between looking at odd and even k-diagonals. That is
229 // because when we extend a `d-path` by a single move we can at most move
230 // one diagonal over. That is either move from `k-1` to `k` or from `k+1` to
231 // `k`. That is if `d` is even (odd) then we require only the odd (even)
232 // k-diagonals calculated in step `d-1`.
233 for (int k = -d; k <= d; k += 2) {
234 if (k == -d || (k != d && fr_forward_[k - 1] < fr_forward_[k + 1])) {
235 // Move downwards, i.e. add an insertion, because either we are at the
236 // edge and downwards is the only way we can move, or because the
237 // `d-1`-path along the `k+1` diagonal reaches further on the x-axis
238 // than the `d-1`-path along the `k-1` diagonal.
239 from.x = to.x = fr_forward_[k + 1];
240 } else {
241 // Move right, i.e. add a deletion.
242 from.x = fr_forward_[k - 1];
243 to.x = from.x + 1;
244 }
245
246 // Calculate y via y = x - k. We need to adjust k though since the k=0
247 // diagonal is centered on `area.top_left` and not (0, 0).
248 to.y = area.top_left.y + (to.x - area.top_left.x) - k;
249 from.y = (d == 0 || from.x != to.x) ? to.y : to.y - 1;
250
251 // Extend the snake diagonally as long as we can.
252 while (to < area.bottom_right && input_->Equals(to.x, to.y)) {
253 ++to.x;
254 ++to.y;
255 }
256
257 fr_forward_[k] = to.x;
258
259 // Check whether there is a reverse path on this k-diagonal which we
260 // are overlapping with. If yes, that is our snake.
261 const bool odd = area.delta() % 2 != 0;
262 const int l = k - area.delta();
263 if (odd && l >= (-d + 1) && l <= d - 1 && to.x >= fr_reverse_[l]) {
264 return Snake{from, to};
265 }
266 }
267 return std::nullopt;
268 }
269
270 // Greedily calculates the furthest reaching reverse `d`-paths for each
271 // l-diagonal where l is in [-d, d].
272 // Works the same as `ShortestEditForward` but we move upwards and left
273 // instead.
274 std::optional<Snake> ShortestEditReverse(const EditGraphArea& area, int d) {
275 Point from, to;
276 // We alternate between looking at odd and even l-diagonals. That is
277 // because when we extend a `d-path` by a single move we can at most move
278 // one diagonal over. That is either move from `l-1` to `l` or from `l+1` to
279 // `l`. That is if `d` is even (odd) then we require only the odd (even)
280 // l-diagonals calculated in step `d-1`.
281 for (int l = d; l >= -d; l -= 2) {
282 if (l == d || (l != -d && fr_reverse_[l - 1] > fr_reverse_[l + 1])) {
283 // Move upwards, i.e. add an insertion, because either we are at the
284 // edge and upwards is the only way we can move, or because the
285 // `d-1`-path along the `l-1` diagonal reaches further on the x-axis
286 // than the `d-1`-path along the `l+1` diagonal.
287 from.x = to.x = fr_reverse_[l - 1];
288 } else {
289 // Move left, i.e. add a deletion.
290 from.x = fr_reverse_[l + 1];
291 to.x = from.x - 1;
292 }
293
294 // Calculate y via y = x - k. We need to adjust k though since the k=0
295 // diagonal is centered on `area.top_left` and not (0, 0).
296 const int k = l + area.delta();
297 to.y = area.top_left.y + (to.x - area.top_left.x) - k;
298 from.y = (d == 0 || from.x != to.x) ? to.y : to.y + 1;
299
300 // Extend the snake diagonally as long as we can.
301 while (area.top_left < to && input_->Equals(to.x - 1, to.y - 1)) {
302 --to.x;
303 --to.y;
304 }
305
306 fr_reverse_[l] = to.x;
307
308 // Check whether there is a path on this k-diagonal which we
309 // are overlapping with. If yes, that is our snake.
310 const bool even = area.delta() % 2 == 0;
311 if (even && k >= -d && k <= d && to.x <= fr_forward_[k]) {
312 // Invert the points so the snake goes left to right, top to bottom.
313 return Snake{to, from};
314 }
315 }
316 return std::nullopt;
317 }
318
319 // Small helper class that converts a "shortest edit script" path into a
320 // source mapping. The result is a list of "chunks" where each "chunk"
321 // describes a range in the input string and where it can now be found
322 // in the output string.
323 //
324 // The list of chunks can be calculated in a simple pass over all the points
325 // of the edit path:
326 //
327 // - For any diagonal we close and report the current chunk if there is
328 // one open at the moment.
329 // - For an insertion or deletion we open a new chunk if none is ongoing.
330 class ResultWriter {
331 public:
332 explicit ResultWriter(Comparator::Output* output) : output_(output) {}
333
334 void RecordNoModification(const Point& from) {
335 if (!change_is_ongoing_) return;
336
337 // We close the current chunk, going from `change_start_` to `from`.
338 CHECK(change_start_);
339 output_->AddChunk(change_start_->x, change_start_->y,
340 from.x - change_start_->x, from.y - change_start_->y);
341 change_is_ongoing_ = false;
342 }
343
344 void RecordInsertionOrDeletion(const Point& from) {
345 if (change_is_ongoing_) return;
346
347 // We start a new chunk beginning at `from`.
349 change_is_ongoing_ = true;
350 }
351
352 private:
353 Comparator::Output* output_;
354 bool change_is_ongoing_ = false;
355 std::optional<Point> change_start_;
356 };
357
358 // Takes an edit path and "fills in the blanks". That is we notify the
359 // `ResultWriter` after each single downwards, left or diagonal move.
360 void WriteResult(const Path& path) {
361 ResultWriter writer(output_);
362
363 for (size_t i = 1; i < path.points.size(); ++i) {
364 Point p1 = path.points[i - 1];
365 Point p2 = path.points[i];
366
367 p1 = WalkDiagonal(writer, p1, p2);
368 const int cmp = (p2.x - p1.x) - (p2.y - p1.y);
369 if (cmp == -1) {
370 writer.RecordInsertionOrDeletion(p1);
371 p1.y++;
372 } else if (cmp == 1) {
373 writer.RecordInsertionOrDeletion(p1);
374 p1.x++;
375 }
376
377 p1 = WalkDiagonal(writer, p1, p2);
378 DCHECK(p1.x == p2.x && p1.y == p2.y);
379 }
380
381 // Write one diagonal in the end to flush out any open chunk.
382 writer.RecordNoModification(path.points.back());
383 }
384
385 Point WalkDiagonal(ResultWriter& writer, Point p1, Point p2) {
386 while (p1.x < p2.x && p1.y < p2.y && input_->Equals(p1.x, p1.y)) {
387 writer.RecordNoModification(p1);
388 p1.x++;
389 p1.y++;
390 }
391 return p1;
392 }
393
394 public:
395 static void MyersDiff(Comparator::Input* input, Comparator::Output* output) {
396 MyersDiffer differ(input, output);
397 auto result = differ.FindEditPath();
398 if (!result) return; // Empty input doesn't produce a path.
399
400 differ.WriteResult(*result);
401 }
402};
403
404} // namespace
405
407 Comparator::Output* result_writer) {
408 MyersDiffer::MyersDiff(input, result_writer);
409}
410
411} // namespace internal
412} // namespace v8
static void CalculateDifference(Input *input, Output *result_writer)
XMMRegister const input_
ZoneVector< RpoNumber > & result
Point from
FurthestReaching fr_forward_
Point bottom_right
bool change_is_ongoing_
int x
std::optional< Point > change_start_
std::vector< Point > points
Comparator::Output * output_
FurthestReaching fr_reverse_
std::vector< int > v_
Point top_left
Point to
void Add(RWDigits Z, Digits X, Digits Y)
V8_INLINE constexpr bool operator<(Builtin a, Builtin b)
Definition builtins.h:75
#define CHECK(condition)
Definition logging.h:124
#define DCHECK(condition)
Definition logging.h:482