Browse Source

A few more HTML conversion improvements

- Fix segfault on empty output (bug was in XS code)
- Further improve end-of-URL detection
- Recognize a few common multi-character sections in man references
Yorhel 2 years ago
parent
commit
746889851c
3 changed files with 18 additions and 5 deletions
  1. 1
    1
      Makefile
  2. 1
    1
      lib/ManUtils/ManUtils.xs
  3. 16
    3
      web/src/lib.rs

+ 1
- 1
Makefile View File

@@ -6,7 +6,7 @@ all: ManUtils indexer
6 6
 ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
7 7
 
8 8
 lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a
9
-	test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
9
+	-test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
10 10
 	cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
11 11
 	touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
12 12
 

+ 1
- 1
lib/ManUtils/ManUtils.xs View File

@@ -20,7 +20,7 @@ html(str)
20 20
     STRLEN len;
21 21
     char *inbuf = SvPV(str, len);
22 22
     struct StringWrap buf = grotty2html_wrap(inbuf, len);
23
-    SV *dest = newSVpv(buf.buf, buf.len);
23
+    SV *dest = buf.len ? newSVpv(buf.buf, buf.len) : newSVpv("", 0);
24 24
     grotty2html_free(buf);
25 25
     SvUTF8_on(dest);
26 26
     RETVAL = dest;

+ 16
- 3
web/src/lib.rs View File

@@ -181,7 +181,9 @@ impl FmtBuf {
181 181
         // - https://manned.org/urn/8cb83e85
182 182
         // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
183 183
         // inside the URL.
184
-        let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\'');
184
+        let url = url.trim_right_matches(|c|
185
+            match c { '.' | ',' | ';' | ')' | '⟩' | '\'' | ':' | ']' | '}' => true, _ => false }
186
+        );
185 187
         if url.len() < 10 {
186 188
             return;
187 189
         }
@@ -194,8 +196,19 @@ impl FmtBuf {
194 196
     fn flush_ref(&self, st: &mut Flush, end: usize) {
195 197
         // We know where the closing bracket is in the string, so this regex is used to search
196 198
         // backwards from there and find the start of the reference.
199
+        // There are a lot of 'special' multi-character section names, so it might not make sense
200
+        // to parse all of them. Here's an estimate of a few 'special' section references, in
201
+        // number of man pages using the reference (using ~ '%(3pm)%' on the 2017-01-14 database):
202
+        // - 3pm    17810
203
+        // - 3w      8729 (just a few packages)
204
+        // - 3tcl    2000
205
+        // - 3tk      758
206
+        // - 3p       309
207
+        // - 3perl    268
208
+        // - 3ssl     198
197 209
         lazy_static!(
198
-            static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap();
210
+            // XXX: Make sure to keep this regex in sync with the one in flush()
211
+            static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl]|3tcl|3pm|3tk)\)$").unwrap();
199 212
         );
200 213
 
201 214
         // Disallow some characters following a reference
@@ -223,7 +236,7 @@ impl FmtBuf {
223 236
         // This regex is used to quickly *find* interesting patterns, any further validation
224 237
         // and processing is done afterwards by the (slower) specialized flush_ methods.
225 238
         lazy_static!(
226
-            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap();
239
+            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\(([1-9nl]|3tcl|3pm|3tk)\))").unwrap();
227 240
         );
228 241
 
229 242
         let mut st = Flush{

Loading…
Cancel
Save