Browse Source

Experimental rewrite of grotty to html conversion in Rust

The previous C code was troublesome.
- Didn't handle long lines
- I couldn't convince myself that it was free of memory safety issues
- Needed improving anyway, there are some formatting bugs. These are
  hard to fix in the current code.

I mostly replicated the formatting bugs of the old C implementation in
Rust, and possibly added a few new bugs as well. It's not a significant
improvement right now; more testing and fixing will be needed.

The performance of both implementations is comparable, with the Rust
version being slightly faster in many cases (and slower in some others).
I did spend more time trying to optimize this Rust version than I did
with the old C code. I initially tried a naive-ish conversion of the C
code to Rust, but that turned out to be much slower and I had to resort
to using regexes and different data structures to fix that.
Yorhel 1 year ago
parent
commit
6114b17389
10 changed files with 533 additions and 299 deletions
  1. 1
    1
      .gitignore
  2. 12
    3
      Makefile
  3. 3
    3
      README
  4. 2
    0
      lib/ManUtils/Build.PL
  5. 6
    7
      lib/ManUtils/ManUtils.pm
  6. 13
    285
      lib/ManUtils/ManUtils.xs
  7. 121
    0
      web/Cargo.lock
  8. 16
    0
      web/Cargo.toml
  9. 348
    0
      web/src/lib.rs
  10. 11
    0
      web/src/main.rs

+ 1
- 1
.gitignore View File

@@ -3,4 +3,4 @@
3 3
 !/lib/ManUtils/ManUtils.pm
4 4
 !/lib/ManUtils/ManUtils.xs
5 5
 indexer/target
6
-
6
+web/target

+ 12
- 3
Makefile View File

@@ -2,18 +2,27 @@
2 2
 
3 3
 all: ManUtils indexer
4 4
 
5
-ManUtils: lib/ManUtils/Build
5
+
6
+ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
7
+
8
+lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a
9
+	test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
6 10
 	cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
11
+	touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
12
+
13
+web/target/release/libweb.a: web/Cargo.toml web/src/*.rs
14
+	cd web && cargo build --release
15
+	#strip --strip-unneeded web/target/release/libweb.a
7 16
 
8
-lib/ManUtils/Build: lib/ManUtils/Build.PL
9
-	cd lib/ManUtils && perl Build.PL
10 17
 
11 18
 indexer: indexer/target/release/indexer
12 19
 
13 20
 indexer/target/release/indexer: indexer/Cargo.toml indexer/src/*.rs
14 21
 	cd indexer && cargo build --release
15 22
 
23
+
16 24
 clean:
17 25
 	cd lib/ManUtils && ./Build distclean
18 26
 	rm -rf lib/ManUtils/inst
19 27
 	cd indexer && cargo clean
28
+	cd web && cargo clean

+ 3
- 3
README View File

@@ -13,10 +13,11 @@ Requirements
13 13
   General:
14 14
     perl: A somewhat recent version (no idea which, due to my XS usage)
15 15
     postgresql: Also a somewhat recent version
16
+    rust + cargo (1.13+)
17
+
18
+  www/ & lib/ & web/: (Website)
16 19
     DBI
17 20
     DBD::Pg
18
-
19
-  www/: (Website)
20 21
     TUWF
21 22
     JSON::XS
22 23
     AnyEvent
@@ -24,7 +25,6 @@ Requirements
24 25
   util/ & indexer/: (DB updating and package synchronisation stuff)
25 26
     curl
26 27
     psql
27
-    cargo + rust (1.13+)
28 28
 
29 29
 
30 30
 Contact

+ 2
- 0
lib/ManUtils/Build.PL View File

@@ -6,6 +6,8 @@ Module::Build->new(
6 6
   dist_name => 'ManUtils',
7 7
   dist_version_from => 'ManUtils.pm',
8 8
   dist_abstract => 'Utils for manned.org',
9
+  license => 'MIT',
10
+  extra_linker_flags => '../../web/target/release/libweb.a',
9 11
   pm_files => {
10 12
     'ManUtils.pm' => 'lib/ManUtils.pm',
11 13
   },

+ 6
- 7
lib/ManUtils/ManUtils.pm View File

@@ -32,11 +32,12 @@ sub fmt {
32 32
   # Other .so's should be handled by html()
33 33
   $input =~ s/^\.so (.+)$/.in -10\n.sp\n\[\[\[MANNEDINCLUDE$1\]\]\]/mg;
34 34
 
35
-  # Disable hyphenation, since that screws up man page references. :-(
36
-  $input = ".hy 0\n.de hy\n..\n$input";
37
-
38
-  # Emulate man-db's --nj option
39
-  $input = ".na\n.de ad\n..\n$input";
35
+  $input =
36
+    # Disable hyphenation, since that screws up man page references. :-(
37
+     ".hy 0\n.de hy\n..\n"
38
+    # Emulate man-db's --nj option
39
+    .".na\n.de ad\n..\n"
40
+    .$input;
40 41
 
41 42
   $input = encode_utf8($input);
42 43
 
@@ -100,6 +101,4 @@ sub fmt_block {
100 101
   $out;
101 102
 }
102 103
 
103
-
104 104
 1;
105
-

+ 13
- 285
lib/ManUtils/ManUtils.xs View File

@@ -2,278 +2,13 @@
2 2
 #include "perl.h"
3 3
 #include "XSUB.h"
4 4
 
5
+struct StringWrap {
6
+  char *buf;
7
+  unsigned long long len, cap;
8
+};
5 9
 
6
-// Convert grotty output to HTML for use in a <pre> tag.
7
-// It is assumed that the given input string is valid UTF-8, either represented
8
-// as a Perl Unicode string, or as a UTF-8 encoded byte string. The data may
9
-// not contain the 0 character.
10
-// The formatted HTML is returned as a Perl Unicode string.
11
-// It is also assumed that hyphenation has been disabled when generating the
12
-// grotty output.
13
-
14
-
15
-// This implementation really is fast enough for "real-time" use in the website
16
-// code, very much unlike my experiments with Perl. My previous Perl
17
-// implementation took about 1.5s for rsync(1), whereas I've not seen this
18
-// implementation take more than 15ms.
19
-
20
-// TODO: Unicode characters aren't truncated correctly when a line exceeds
21
-// MAXLINE bytes. I've only seen this happening on man pages that grotty
22
-// couldn't wrap, e.g. some Japanese and Chinese mans.
23
-// (Ideally, I'd tell grotty how to wrap those correctly)
24
-
25
-#include <stdio.h>
26
-#include <stdlib.h>
27
-#include <string.h>
28
-#include <ctype.h>
29
-
30
-#define MAXLINE 1024
31
-
32
-#define LB 1
33
-#define LI 2
34
-
35
-typedef struct ctx_t {
36
-  const char *src; // Pointer to the source data, or what's left of it.
37
-  SV *dest; // Destination string to write to.
38
-
39
-  // Current line
40
-  char line[MAXLINE];
41
-  char flags[MAXLINE]; // 0 = no fmt, LB = bold, LI = italic. (No combinations allowed)
42
-  int linelen;
43
-  int noref; // 1 if the current line shouldn't be checked for references. (Used for first and last line)
44
-} ctx_t;
45
-
46
-
47
-
48
-// Escapes and appends a displayed character to the output string.
49
-static inline void flushescape(ctx_t *x, char c) {
50
-  static char str[2] = {};
51
-  // Most HTML-escape functions also escape " to &quot;, but since we aren't
52
-  // going to put a man page in an XML attribute, we don't really have to worry
53
-  // about that one.
54
-  switch(c) {
55
-    case '>': sv_catpvn(x->dest, "&gt;", 4); break;
56
-    case '<': sv_catpvn(x->dest, "&lt;", 4); break;
57
-    case '&': sv_catpvn(x->dest, "&amp;", 5); break;
58
-    default:
59
-      str[0] = c;
60
-      sv_catpvn(x->dest, str, 1);
61
-  }
62
-}
63
-
64
-
65
-// HTML-escapes and adds formatting tags to a certain chunk of data and appends
66
-// it to the output string. The chunk is considered as an individual part,
67
-// assuming that any formatting is disabled at the start of the chunk, and
68
-// making sure it is disabled again at the end.
69
-// e points to the last character in s that is not considered part of the chunk.
70
-static void flushchunk(ctx_t *x, const char *s, const char *f, const char *e) {
71
-  int fmt = 0;
72
-
73
-#define EFMT if(fmt) sv_catpvn(x->dest, fmt == LB ? "</b>" : "</i>", 4)
74
-
75
-  while(s != e) {
76
-    // Consider underscore and whitespace to have the same formatting as the
77
-    // previous character.  The grotty escape sequences don't work well for the
78
-    // underscore character, and you can't see the difference either way.
79
-    if(fmt != *f && *s != '_' && *s != ' ') {
80
-      EFMT;
81
-      fmt = *f;
82
-      if(fmt)
83
-        sv_catpvn(x->dest, fmt == LB ? "<b>" : "<i>", 3);
84
-    }
85
-    flushescape(x, *s);
86
-    s++;
87
-    f++;
88
-  }
89
-  EFMT;
90
-
91
-#undef EFMT
92
-}
93
-
94
-
95
-#define ismanchar(x) (isalnum(x) || x == '_' || x == '-' || x == '.')
96
-
97
-
98
-static void flushinclude(ctx_t *x) {
99
-  char buf[8] = {};
100
-  char *s = x->line;
101
-
102
-  s[x->linelen-3] = 0;
103
-  s += 16;
104
-  char *fn = strrchr(s, '/');
105
-  fn = fn ? fn+1 : s;
106
-  sv_catpv(x->dest, "&gt;&gt; Included manual page: <a href=\"/");
107
-
108
-  // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an
109
-  // Unicode dash when passed through groff, which we need to revert in order
110
-  // to get the link working. (Apparently it recognizes man page references and
111
-  // URLs, as it doesn't do this replacement in those situations.)
112
-  while(*fn) {
113
-    if(*fn == (char)0xe2 && fn[1] == (char)0x80 && fn[2] == (char)0x90) {
114
-      buf[0] = '-';
115
-      fn += 3;
116
-    } else {
117
-      buf[0] = *fn;
118
-      fn++;
119
-    }
120
-    sv_catpvn(x->dest, buf, 1);
121
-  }
122
-
123
-  sv_catpv(x->dest, "\">");
124
-  sv_catpv(x->dest, s);
125
-  sv_catpv(x->dest, "</a>");
126
-}
127
-
128
-
129
-// HTML-escapes and "Flushes" the current line to the output string. Tries to
130
-// convert man references and URLs into links if format is true.
131
-static void flushline(ctx_t *x) {
132
-  static const char eol[] = "\n";
133
-  char *s = x->line, *es = x->line;
134
-
135
-  // Special-case [[[MANNEDINCLUDE ..]]] directive
136
-  if(x->linelen > 20 && *s == '[' && strncmp(s, "[[[MANNEDINCLUDE", 16) == 0 && strcmp("]]]", s+x->linelen-3) == 0) {
137
-    flushinclude(x);
138
-    goto end;
139
-  }
140
-
141
-  if(x->noref) {
142
-    flushchunk(x, x->line, x->flags, x->line+x->linelen);
143
-    goto end;
144
-  }
145
-
146
-#define flush(end) do {\
147
-    flushchunk(x, es, x->flags+(es-x->line), end);\
148
-    es = end;\
149
-  } while(0)
150
-
151
-  while(*s) {
152
-    // Man page reference.
153
-    // Detected by the "(x)", but then checked backwards in the buffer to find
154
-    // the start of the reference. This is pretty fast. Fails on:
155
-    // - JSON.3pm: JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
156
-    if(*s == '(' && (('1' <= s[1] && s[1] <= '9') || s[1] == 'n') && s[2] == ')' && !isalnum(s[3])) {
157
-      char *n = s-1;
158
-      while(n >= es && ismanchar(*n))
159
-        n--;
160
-      if(++n < s) {
161
-        flush(n);
162
-        *s = 0;
163
-        sv_catpvf(x->dest, "<a href=\"/%s.%c\">%s(%c)</a>", n, s[1], n, s[1]);
164
-        s += 3;
165
-        es = s;
166
-        continue;
167
-      }
168
-    }
169
-
170
-    // HTTP(s) URL.
171
-    // This is just a simple q{https?://[^ ][.,;"\)>]?( |$)} match, doesn't
172
-    // always work right, e.g.:
173
-    // - https://manned.org/spu_run/414316a1 -> URL wrapped to new line
174
-    // Note: Don't use strncmp() before manually checking for 'http'. The parse
175
-    // time is otherwise increased by a factor 2.
176
-    if(s[0] == 'h' && s[1] == 't' && s[2] == 't' && s[3] == 'p' && (strncmp(s, "http://", 7) == 0 || strncmp(s, "https://", 8) == 0)) {
177
-      // Find the end of the URL (space or some other weird character).
178
-      char *sep = s;
179
-      while(*sep && *sep != '>' && *sep != '<' && *sep != ' ' && *sep != '"')
180
-        sep++;
181
-      char *sp = sep;
182
-      if(sp > s+10) {
183
-        flush(s);
184
-        char endchr = *sp;
185
-        *(sp--) = 0;
186
-        if(*sp == '.' || *sp == ',' || *sp == ';' || *sp == ')') {
187
-          sp[1] = endchr;
188
-          endchr = *sp;
189
-          *(sp--) = 0;
190
-        }
191
-        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
192
-        // - https://manned.org/troff/c4467840
193
-        // - https://manned.org/pass/78413b49
194
-        // - https://manned.org/empathy-accounts/8c05b2c1
195
-        // - https://manned.org/urn/8cb83e85
196
-        // - https://manned.org/wine/4a699a22
197
-        if(*sp == '\xa9' && *(sp-1) == '\x9f' && *(sp-2) == '\xe2') {
198
-          sp[1] = endchr;
199
-          sp -= 3;
200
-          endchr = sp[1];
201
-          sp[1] = 0;
202
-        }
203
-        sv_catpvf(x->dest, "<a href=\"%s\" rel=\"nofollow\">%s</a>", s, s);
204
-        *(++sp) = endchr;
205
-        es = s = sp;
206
-        continue;
207
-      }
208
-    }
209
-    s++;
210
-  }
211
-
212
-  flush(s);
213
-#undef flush
214
-
215
-end:
216
-  sv_catpvn(x->dest, eol, sizeof(eol)-1);
217
-}
218
-
219
-
220
-// Adds a character to the current line, calls flushline() when a new line is done.
221
-// TODO: Convert \t into spaces? The rest of the code is written with the
222
-// assumption that \t does not occur in the string. I've not seen grotty output
223
-// tabs yet, but it's still a good idea to define what *we* do with tabs.
224
-static void appendline(ctx_t *x, char c, char f) {
225
-  if(c == '\r')
226
-    return;
227
-
228
-  if(c == '\n' || x->linelen > MAXLINE+1) {
229
-    x->line[x->linelen] = 0;
230
-    flushline(x);
231
-    x->linelen = 0;
232
-    x->noref = 0;
233
-    if(c == '\n')
234
-      return;
235
-  }
236
-
237
-  x->line[x->linelen] = c;
238
-  x->flags[x->linelen] = f;
239
-  x->linelen++;
240
-}
241
-
242
-
243
-// Parses the grotty escapes and calls appendline() for each character.
244
-static void parselines(ctx_t *x) {
245
-  int i, ini = 0, inb = 0;
246
-  const char *buf = x->src;
247
-
248
-  while(*buf) {
249
-    int c1 = UTF8SKIP(buf);
250
-    // Escape character right after a formatting code? Ignore the escape
251
-    // character and formatting code after that. Grotty sometimes
252
-    // double-formats a character, so you get "f ESC c ESC f ESC c", which you
253
-    // should read as "(f ESC c) ESC (f ESC c)".
254
-    if(*buf == 8 && buf[1] && buf[1+UTF8SKIP(buf+1)] == 8 && buf[2+UTF8SKIP(buf+1)]) {
255
-      int c2 = UTF8SKIP(buf+1);
256
-      buf += 2 + c2 + UTF8SKIP(buf+1+c2);
257
-      continue;
258
-    }
259
-    // Formatting code
260
-    if(buf[c1] == 8 && buf[c1+1]) {
261
-      int c2 = UTF8SKIP(buf+c1+1);
262
-      for(i=0; i<c2; i++)
263
-        appendline(x, buf[c1+i+1], *buf == '_' ? LI : LB);
264
-      buf += c1+c2+1;
265
-      continue;
266
-    }
267
-    // Regular character
268
-    if(*buf == '\n' && !buf[1])
269
-      x->noref = 1;
270
-    appendline(x, *buf, 0);
271
-    buf++;
272
-  }
273
-  x->noref = 1;
274
-  appendline(x, '\n', 0);
275
-}
276
-
10
+struct StringWrap grotty2html_wrap(const char *, unsigned long long);
11
+void grotty2html_free(struct StringWrap);
277 12
 
278 13
 
279 14
 MODULE = ManUtils	 PACKAGE = ManUtils
@@ -281,20 +16,13 @@ MODULE = ManUtils	 PACKAGE = ManUtils
281 16
 SV *
282 17
 html(str)
283 18
   SV *str
284
-  INIT:
285
-    ctx_t *x = malloc(sizeof(ctx_t));
286 19
   CODE:
287
-    x->src = SvPV_nolen(str);
288
-    x->dest = newSVpv("", 0);
289
-    x->linelen = 0;
290
-    x->noref = 1;
291
-    parselines(x);
292
-    // Set the UTF8 flag *after* generating the result string. For some reason
293
-    // that prevents sv_catpvf() from interpreting our C strings as something
294
-    // other than UTF-8.
295
-    SvUTF8_on(x->dest);
296
-    RETVAL = x->dest;
297
-    free(x);
20
+    STRLEN len;
21
+    char *inbuf = SvPV(str, len);
22
+    struct StringWrap buf = grotty2html_wrap(inbuf, len);
23
+    SV *dest = newSVpv(buf.buf, buf.len);
24
+    grotty2html_free(buf);
25
+    SvUTF8_on(dest);
26
+    RETVAL = dest;
298 27
   OUTPUT:
299 28
     RETVAL
300
-

+ 121
- 0
web/Cargo.lock View File

@@ -0,0 +1,121 @@
1
+[root]
2
+name = "web"
3
+version = "0.1.0"
4
+dependencies = [
5
+ "lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
6
+ "regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
7
+]
8
+
9
+[[package]]
10
+name = "aho-corasick"
11
+version = "0.6.1"
12
+source = "registry+https://github.com/rust-lang/crates.io-index"
13
+dependencies = [
14
+ "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
15
+]
16
+
17
+[[package]]
18
+name = "kernel32-sys"
19
+version = "0.2.2"
20
+source = "registry+https://github.com/rust-lang/crates.io-index"
21
+dependencies = [
22
+ "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
23
+ "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
24
+]
25
+
26
+[[package]]
27
+name = "lazy_static"
28
+version = "0.2.2"
29
+source = "registry+https://github.com/rust-lang/crates.io-index"
30
+
31
+[[package]]
32
+name = "libc"
33
+version = "0.2.19"
34
+source = "registry+https://github.com/rust-lang/crates.io-index"
35
+
36
+[[package]]
37
+name = "memchr"
38
+version = "1.0.1"
39
+source = "registry+https://github.com/rust-lang/crates.io-index"
40
+dependencies = [
41
+ "libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
42
+]
43
+
44
+[[package]]
45
+name = "regex"
46
+version = "0.2.1"
47
+source = "registry+https://github.com/rust-lang/crates.io-index"
48
+dependencies = [
49
+ "aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
50
+ "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
51
+ "regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
52
+ "thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
53
+ "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
54
+]
55
+
56
+[[package]]
57
+name = "regex-syntax"
58
+version = "0.4.0"
59
+source = "registry+https://github.com/rust-lang/crates.io-index"
60
+
61
+[[package]]
62
+name = "thread-id"
63
+version = "3.0.0"
64
+source = "registry+https://github.com/rust-lang/crates.io-index"
65
+dependencies = [
66
+ "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
67
+ "libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
68
+]
69
+
70
+[[package]]
71
+name = "thread_local"
72
+version = "0.3.2"
73
+source = "registry+https://github.com/rust-lang/crates.io-index"
74
+dependencies = [
75
+ "thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
76
+ "unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
77
+]
78
+
79
+[[package]]
80
+name = "unreachable"
81
+version = "0.1.1"
82
+source = "registry+https://github.com/rust-lang/crates.io-index"
83
+dependencies = [
84
+ "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
85
+]
86
+
87
+[[package]]
88
+name = "utf8-ranges"
89
+version = "1.0.0"
90
+source = "registry+https://github.com/rust-lang/crates.io-index"
91
+
92
+[[package]]
93
+name = "void"
94
+version = "1.0.2"
95
+source = "registry+https://github.com/rust-lang/crates.io-index"
96
+
97
+[[package]]
98
+name = "winapi"
99
+version = "0.2.8"
100
+source = "registry+https://github.com/rust-lang/crates.io-index"
101
+
102
+[[package]]
103
+name = "winapi-build"
104
+version = "0.1.1"
105
+source = "registry+https://github.com/rust-lang/crates.io-index"
106
+
107
+[metadata]
108
+"checksum aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4f660b942762979b56c9f07b4b36bb559776fbad102f05d6771e1b629e8fd5bf"
109
+"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
110
+"checksum lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6abe0ee2e758cd6bc8a2cd56726359007748fbf4128da998b65d0b70f881e19b"
111
+"checksum libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)" = "9e030dc72013ed68994d1b2cbf36a94dd0e58418ba949c4b0db7eeb70a7a6352"
112
+"checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4"
113
+"checksum regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4278c17d0f6d62dfef0ab00028feb45bd7d2102843f80763474eeb1be8a10c01"
114
+"checksum regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9191b1f57603095f105d317e375d19b1c9c5c3185ea9633a99a6dcbed04457"
115
+"checksum thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4437c97558c70d129e40629a5b385b3fb1ffac301e63941335e4d354081ec14a"
116
+"checksum thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7793b722f0f77ce716e7f1acf416359ca32ff24d04ffbac4269f44a4a83be05d"
117
+"checksum unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1f2ae5ddb18e1c92664717616dd9549dde73f539f01bd7b77c2edb2446bdff91"
118
+"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
119
+"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
120
+"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
121
+"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"

+ 16
- 0
web/Cargo.toml View File

@@ -0,0 +1,16 @@
1
+[package]
2
+name = "web"
3
+version = "0.1.0"
4
+authors = ["yorhel"]
5
+
6
+[lib]
7
+name = "web"
8
+crate-type = ["lib", "staticlib"]
9
+
10
+[dependencies]
11
+regex = "0.2.1"
12
+lazy_static = "0.2.2"
13
+
14
+# Add debugging symbols even in release mode, in order to help with profiling.
15
+[profile.release]
16
+debug = true

+ 348
- 0
web/src/lib.rs View File

@@ -0,0 +1,348 @@
1
+#![feature(test)]
2
+extern crate test;
3
+extern crate regex;
4
+#[macro_use] extern crate lazy_static;
5
+
6
+use std::fmt::Write;
7
+use regex::Regex;
8
+
9
+
10
+#[derive(Clone,Copy,PartialEq,Eq)]
11
+enum FmtChar {
12
+    Regular,
13
+    Italic,
14
+    Bold,
15
+}
16
+
17
+
18
+/* Simple state machine to parse the following grammar:
19
+ *
20
+ * fmtchar       = escape | double-escape | char
21
+ * escape        = tag ESC char
22
+ * double-escape = ESC tag ESC char
23
+ * tag           = "_"  # italic
24
+ *               | char # bold
25
+ *
26
+ * This format is described as "old behaviour" in grotty(1).  The double-escape
27
+ * seems to be a weird glitch, and can be interpreted as
28
+ * "(tag ESC char) ESC (tag ESC char)".  This parser simply skips over any such
29
+ * sequence starting with ESC. */
30
+enum CharParse {
31
+    Start,
32
+    One(char),      // Seen a single character (either 'char' or 'escape')
33
+    Escape(char),   // Seen a single character + escape
34
+    DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip
35
+}
36
+
37
+
38
+impl CharParse {
39
+    fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
40
+        match *self {
41
+
42
+            CharParse::Start => {
43
+                *self = if chr == 8 as char { CharParse::DoubleEsc(2) } else { CharParse::One(chr) };
44
+                None
45
+            },
46
+
47
+            CharParse::One(c) =>
48
+                if chr == 8 as char {
49
+                    *self = CharParse::Escape(c);
50
+                    None
51
+                } else {
52
+                    *self = CharParse::One(chr);
53
+                    Some((c, FmtChar::Regular))
54
+                },
55
+
56
+            CharParse::Escape(c) => {
57
+                *self = CharParse::Start;
58
+                Some((chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold }))
59
+            },
60
+
61
+            CharParse::DoubleEsc(n) => {
62
+                *self = if n == 0 { CharParse::Start } else { CharParse::DoubleEsc(n-1) };
63
+                None
64
+            },
65
+        }
66
+    }
67
+}
68
+
69
+
70
+fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) {
71
+    if new != old && old != FmtChar::Regular {
72
+        out.push_str(if old == FmtChar::Italic { "</i>" } else { "</b>" });
73
+    }
74
+    if new != old && new != FmtChar::Regular {
75
+        out.push_str(if new == FmtChar::Italic { "<i>" } else { "<b>" });
76
+    }
77
+}
78
+
79
+
80
+// Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of
81
+// indices where text formatting changes are performed.
82
+struct FmtBuf {
83
+    buf: String,
84
+    // List of formatting chunks. The number indicates the byte index where the formatting
85
+    // ends. E.g. [(5,Regular),(10,Bold),(15,Italic)] means:
86
+    //   [0..5] is Regular
87
+    //   [5..10] is Bold
88
+    //   [10..15] is Italic
89
+    fmt: Vec<(usize,FmtChar)>,
90
+    lastfmt: FmtChar,
91
+}
92
+
93
+// Output state
94
+struct Flush<'a, 'b> {
95
+    out: &'a mut String,
96
+    idx: usize, // Last byte in the buffer that has been processed
97
+    fmt: std::iter::Peekable<std::slice::Iter<'b, (usize,FmtChar)>>, // Iterator over FmtBuf.fmt
98
+}
99
+
100
+
101
+impl FmtBuf {
102
+    fn push(&mut self, chr: char, fmt: FmtChar) {
103
+        // Consider whitespace and underscore to have the same
104
+        // formatting as the previous character; This generates smaller
105
+        // HTML, and you can't see the difference anyway.
106
+        if self.lastfmt != fmt && !(chr == ' ' || chr == '_') {
107
+            self.fmt.push((self.buf.len(), self.lastfmt));
108
+            self.lastfmt = fmt;
109
+        }
110
+        match chr {
111
+            '>' => self.buf.push_str("&gt;"),
112
+            '<' => self.buf.push_str("&lt;"),
113
+            '&' => self.buf.push_str("&amp;"),
114
+            // '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
115
+            _   => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
116
+        }
117
+    }
118
+
119
+    // Flush all unprocessed bytes until 'end' to the output
120
+    fn flush_to(&self, st: &mut Flush, end: usize) {
121
+        let mut lastfmt = FmtChar::Regular;
122
+        while st.idx < end {
123
+            let &&(chunk, fmt) = st.fmt.peek().unwrap();
124
+            let chunk = if chunk > end {
125
+                end
126
+            } else {
127
+                st.fmt.next();
128
+                chunk
129
+            };
130
+            pushfmt(st.out, lastfmt, fmt);
131
+            st.out.push_str(&self.buf[st.idx..chunk]);
132
+            st.idx = chunk;
133
+            lastfmt = fmt;
134
+        }
135
+        pushfmt(st.out, lastfmt, FmtChar::Regular);
136
+    }
137
+
138
+    // Consume the input buffer until 'end' without generating output
139
+    fn flush_skip(&self, st: &mut Flush, end: usize) {
140
+        st.idx = end;
141
+        while st.fmt.peek().unwrap().0 <= st.idx {
142
+            st.fmt.next();
143
+        }
144
+    }
145
+
146
+    fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
147
+        lazy_static!(
148
+            static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
149
+        );
150
+        let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
151
+
152
+        self.flush_to(st, start);
153
+        st.out.push_str("\n&gt;&gt; Included man page: <a href=\"/");
154
+        // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with a Unicode dash
155
+        // when passed through groff, which we need to revert in order to get the link working.
156
+        // (Apparently it recognizes man page references and URLs, as it doesn't do this
157
+        // replacement in those situations.)
158
+        for c in m[2].chars() {
159
+            st.out.push(if c == '‐' { '-' } else { c });
160
+        }
161
+        st.out.push_str("\">");
162
+        st.out.push_str(&m[1]);
163
+        st.out.push_str("</a>");
164
+        self.flush_skip(st, end + m[0].len());
165
+    }
166
+
167
+    fn flush_url(&self, st: &mut Flush, start: usize) {
168
+        lazy_static!(
169
+            // Some characters considered to never be part of a URL.
170
+            // (Note that we can't match literal ><" because of the HTML escaping done previously)
171
+            static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap();
172
+        );
173
+        let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
174
+
175
+        self.flush_to(st, start);
176
+        let url = &self.buf[start..(start + urlend.start())];
177
+
178
+        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
179
+        // - https://manned.org/troff/c4467840
180
+        // - https://manned.org/pass/78413b49
181
+        // - https://manned.org/empathy-accounts/8c05b2c1
182
+        // - https://manned.org/urn/8cb83e85
183
+        // TODO: Check the character before the start of the URL, and only remove ) if there is a
184
+        // starting ( before it.
185
+        let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩');
186
+
187
+        write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
188
+        self.flush_skip(st, start + url.len());
189
+    }
190
+
191
+    fn flush_ref(&self, st: &mut Flush, end: usize) {
192
+        // We know where the closing bracket is in the string, so this regex is used to search
193
+        // backwards from there and find the start of the reference.
194
+        lazy_static!(
195
+            static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
196
+        );
197
+
198
+        // Disallow some characters following a reference
199
+        if self.buf.len() > end {
200
+            let ch = self.buf[end..].chars().next().unwrap();
201
+            if ch == '-' || ch == '_' || ch.is_alphanumeric() {
202
+                return;
203
+            }
204
+        }
205
+
206
+        let m = REF.captures(&self.buf[..end]).unwrap();
207
+        self.flush_to(st, end - m[0].len());
208
+        self.flush_skip(st, end);
209
+        write!(st.out, "<a href=\"/{}.{}\">{}</a>", &m[1], &m[2], &m[0]).unwrap();
210
+    }
211
+
212
+    fn flush(&mut self, out: &mut String) {
213
+        self.fmt.push((self.buf.len(), FmtChar::Regular));
214
+
215
+        // Find the indices where the first line ends, and the last line starts. These are used to
216
+        // efficiently disable reference formatting on the first and last line.
217
+        let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len());
218
+        let lastlinestart = self.buf.trim_right_matches('\n').rfind('\n').unwrap_or(0);
219
+
220
+        // This regex is used to quickly *find* interesting patterns, any further validation
221
+        // and processing is done afterwards by the (slower) specialized flush_ methods.
222
+        lazy_static!(
223
+            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
224
+        );
225
+
226
+        let mut st = Flush{
227
+            out: out,
228
+            idx: 0,
229
+            fmt: self.fmt.iter().peekable(),
230
+        };
231
+
232
+        for i in SEARCH.find_iter(&self.buf) {
233
+            // This can happen with overlapping detections, e.g. when something inside a URL looks
234
+            // like a man page reference.
235
+            if st.idx > i.start() {
236
+                continue;
237
+            }
238
+            let allowref = i.start() > firstlineend && i.start() < lastlinestart;
239
+            match self.buf.as_bytes()[i.end()-1] {
240
+                0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()),
241
+                0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()),
242
+                _            if allowref => self.flush_ref(&mut st, i.end()),
243
+                _ => {}
244
+            }
245
+        }
246
+        self.flush_to(&mut st, self.buf.len());
247
+    }
248
+}
249
+
250
+
251
+pub fn grotty2html(input: &str) -> String {
252
+    let mut state = CharParse::Start;
253
+
254
+    let mut buf = FmtBuf{
255
+        buf: String::with_capacity(128),
256
+        fmt: Vec::with_capacity(128),
257
+        lastfmt: FmtChar::Regular,
258
+    };
259
+
260
+    for chr in input.chars() {
261
+        if let Some((chr, fmt)) = state.update(chr) {
262
+            buf.push(chr, fmt);
263
+            // Line-based flushing is also possible, but not as fast.
264
+            //if chr == '\n' {
265
+            //    buf.flush(&mut out);
266
+            //    buf.buf.clear();
267
+            //    buf.fmt.clear();
268
+            //    buf.lastfmt = FmtChar::Regular;
269
+            //}
270
+        }
271
+    }
272
+    if let CharParse::One(chr) = state {
273
+        buf.push(chr, FmtChar::Regular);
274
+    }
275
+
276
+    let mut out = String::with_capacity(input.len());
277
+    buf.flush(&mut out);
278
+    out
279
+}
280
+
281
+
282
+
283
+use std::os::raw::c_ulonglong;
284
+
285
+#[repr(C)]
286
+pub struct StringWrap {
287
+    buf: *mut u8,
288
+    len: c_ulonglong,
289
+    cap: c_ulonglong,
290
+}
291
+
292
+#[no_mangle]
293
+pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap {
294
+    let input = unsafe { std::str::from_utf8_unchecked( std::slice::from_raw_parts(in_buf, in_len as usize) ) };
295
+    let mut out = grotty2html(input).into_bytes();
296
+    let r = StringWrap {
297
+        buf: out.as_mut_ptr(),
298
+        len: out.len() as c_ulonglong,
299
+        cap: out.capacity() as c_ulonglong,
300
+    };
301
+    std::mem::forget(out);
302
+    r
303
+}
304
+
305
+#[no_mangle]
306
+pub extern fn grotty2html_free(buf: StringWrap) {
307
+    unsafe { Vec::from_raw_parts(buf.buf, buf.len as usize, buf.cap as usize) };
308
+}
309
+
310
+
311
+#[cfg(test)]
312
+mod tests {
313
+    use super::*;
314
+    use std::io::Read;
315
+    use test::Bencher;
316
+
317
+    fn bench_file(b: &mut Bencher, f: &str) {
318
+        let mut f = std::fs::File::open(f).unwrap();
319
+        let mut buf = String::new();
320
+        f.read_to_string(&mut buf).unwrap();
321
+
322
+        b.iter(|| {
323
+            test::black_box(grotty2html(&buf));
324
+        });
325
+    }
326
+
327
+    #[bench]
328
+    fn bench_rsync(b: &mut test::Bencher) {
329
+        bench_file(b, "t/rsync.1.output");
330
+    }
331
+
332
+    #[bench]
333
+    fn bench_ncdu(b: &mut test::Bencher) {
334
+        bench_file(b, "t/ncdu.1.output");
335
+    }
336
+
337
+    #[bench]
338
+    fn bench_javadoc(b: &mut test::Bencher) {
339
+        bench_file(b, "t/javadoc.1.output");
340
+    }
341
+
342
+    /*
343
+    #[bench]
344
+    fn bench_wfilter(b: &mut test::Bencher) {
345
+        bench_file(b, "t/wfilter.4.output");
346
+    }
347
+    */
348
+}

+ 11
- 0
web/src/main.rs View File

@@ -0,0 +1,11 @@
1
+extern crate web;
2
+
3
+use std::io::{stdin,Read};
4
+
5
+fn main() {
6
+    let rd = stdin();
7
+    let mut buf = String::new();
8
+    rd.lock().read_to_string(&mut buf).unwrap();
9
+    println!("{}", web::grotty2html(&buf));
10
+}
11
+