-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding.go
More file actions
105 lines (90 loc) · 2.65 KB
/
encoding.go
File metadata and controls
105 lines (90 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package wpry
import (
"bytes"
"unicode"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
xunicode "golang.org/x/text/encoding/unicode"
"golang.org/x/text/encoding/unicode/utf32"
"golang.org/x/text/transform"
)
// convertToUTF8 performs a best-effort conversion of the provided byte slice to
// UTF-8.
//
// Heuristics (in order):
// 1. If input is already valid UTF-8, return it (strip UTF-8 BOM if present)
// 2. Check for UTF-32 BOMs (BE/LE) and decode when present
// 3. Check for UTF-16 BOMs (BE/LE) and decode when present
// 4. Try Windows-1252
// 5. Try ISO-8859-1
//
// Only accept a decoded result if it's valid UTF-8 and passes a small set of
// sanity checks (no replacement chars, no NULs, no Unicode non-characters, and
// no unexpected control characters). If all attempts fail, return the original
// bytes unchanged.
func convertToUTF8(content []byte) []byte {
// Strip UTF-8 BOM up front.
content = bytes.TrimPrefix(content, []byte{0xEF, 0xBB, 0xBF})
if utf8.Valid(content) {
return content
}
encodings := []encoding.Encoding{
utf32.UTF32(utf32.BigEndian, utf32.ExpectBOM),
xunicode.UTF16(xunicode.BigEndian, xunicode.ExpectBOM),
charmap.Windows1252,
charmap.ISO8859_1,
}
for _, enc := range encodings {
dec := enc.NewDecoder()
out, ok := tryDecode(dec, content)
if !ok {
continue
}
return out
}
return content
}
// tryDecode applies the provided transformer to content and returns the
// transformed bytes if they are valid UTF-8 and pass the sanity checks.
func tryDecode(dec transform.Transformer, content []byte) ([]byte, bool) {
transformed, _, err := transform.Bytes(dec, content)
if err != nil {
return nil, false
}
if !utf8.Valid(transformed) {
return nil, false
}
// Quick check for UTF-8 replacement character sequence (U+FFFD).
if bytes.Contains(transformed, []byte{0xEF, 0xBF, 0xBD}) {
return nil, false
}
for i := 0; i < len(transformed); {
r, sz := utf8.DecodeRune(transformed[i:])
i += sz
if r == utf8.RuneError || isBadRune(r) {
return nil, false
}
}
return transformed, true
}
// isBadRune reports whether r is a rune that indicates a decode failure or
// otherwise makes the decoded content unsuitable for parsing headers.
func isBadRune(r rune) bool {
if r == 0 {
return true
}
// Unicode non-characters: U+FDD0..U+FDEF
if r >= 0xFDD0 && r <= 0xFDEF {
return true
}
// Plane-end non-characters: any code point whose low 16 bits are 0xFFFE or 0xFFFF
if (r&0xFFFF) == 0xFFFE || (r&0xFFFF) == 0xFFFF {
return true
}
// Control characters (except allowed whitespace)
if unicode.IsControl(r) && !unicode.IsSpace(r) {
return true
}
return false
}