-
Notifications
You must be signed in to change notification settings - Fork 15
/
detector.go
147 lines (131 loc) · 3.87 KB
/
detector.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
// Package chardet ports character set detection from ICU.
package chardet
import (
"errors"
"sort"
)
// Result contains all the information that charset detector gives.
// DetectBest returns a single Result; DetectAll returns one Result per
// distinct Charset.
type Result struct {
	// IANA name of the detected charset.
	Charset string
	// IANA name of the detected language. It may be empty for some charsets.
	Language string
	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
	// DetectBest and DetectAll treat a Confidence of 0 as "not detected".
	Confidence int
}
// Detector implements charset detection. Obtain one from NewTextDetector or
// NewHtmlDetector, then call DetectBest or DetectAll on the bytes to inspect.
type Detector struct {
	// recognizers are the candidates whose Match method is run against every
	// input passed to DetectBest or DetectAll.
	recognizers []recognizer
	// stripTag is handed to newRecognizerInput together with the input bytes.
	// NewHtmlDetector sets it to true, NewTextDetector to false.
	// NOTE(review): presumably makes the input preparation strip markup tags
	// before matching — confirm in newRecognizerInput.
	stripTag bool
}
// recognizers is the package-level list of charset recognizers. Both
// NewTextDetector and NewHtmlDetector hand this same slice to the Detector
// they build, and DetectBest / DetectAll run Match on every entry for each
// call, so adding an entry here adds one more candidate to every detection.
// Each entry is built once, at package initialization, by a newRecognizer_*
// constructor defined elsewhere in the package.
var recognizers = []recognizer{
	newRecognizer_utf8(),
	newRecognizer_utf16be(),
	newRecognizer_utf16le(),
	newRecognizer_utf32be(),
	newRecognizer_utf32le(),
	newRecognizer_8859_1_en(),
	newRecognizer_8859_1_da(),
	newRecognizer_8859_1_de(),
	newRecognizer_8859_1_es(),
	newRecognizer_8859_1_fr(),
	newRecognizer_8859_1_it(),
	newRecognizer_8859_1_nl(),
	newRecognizer_8859_1_no(),
	newRecognizer_8859_1_pt(),
	newRecognizer_8859_1_sv(),
	newRecognizer_8859_2_cs(),
	newRecognizer_8859_2_hu(),
	newRecognizer_8859_2_pl(),
	newRecognizer_8859_2_ro(),
	newRecognizer_8859_5_ru(),
	newRecognizer_8859_6_ar(),
	newRecognizer_8859_7_el(),
	newRecognizer_8859_8_I_he(),
	newRecognizer_8859_8_he(),
	newRecognizer_windows_1251(),
	newRecognizer_windows_1256(),
	newRecognizer_KOI8_R(),
	newRecognizer_8859_9_tr(),
	newRecognizer_sjis(),
	newRecognizer_gb_18030(),
	newRecognizer_euc_jp(),
	newRecognizer_euc_kr(),
	newRecognizer_big5(),
	newRecognizer_2022JP(),
	newRecognizer_2022KR(),
	newRecognizer_2022CN(),
	newRecognizer_IBM424_he_rtl(),
	newRecognizer_IBM424_he_ltr(),
	newRecognizer_IBM420_ar_rtl(),
	newRecognizer_IBM420_ar_ltr(),
}
// NewTextDetector creates a Detector for plain text. The Detector uses the
// package-level recognizers list and has stripTag turned off.
func NewTextDetector() *Detector {
	d := Detector{
		recognizers: recognizers,
		stripTag:    false,
	}
	return &d
}
// NewHtmlDetector creates a Detector for Html. The Detector uses the
// package-level recognizers list and has stripTag turned on.
func NewHtmlDetector() *Detector {
	d := Detector{
		recognizers: recognizers,
		stripTag:    true,
	}
	return &d
}
// NotDetectedError is the error returned by DetectBest and DetectAll when no
// recognizer reports a Confidence above zero for the input.
var NotDetectedError = errors.New("Charset not detected.")
// DetectBest runs every recognizer of d against b and returns the Result with
// the highest Confidence.
//
// The recognizers run concurrently, one goroutine each, and their outputs are
// read back in the order the recognizers are listed in d. When several
// recognizers report the same highest Confidence, the one listed first wins,
// so the choice does not depend on goroutine scheduling.
//
// If no recognizer reports a Confidence above zero (including the case of a
// Detector without recognizers), DetectBest returns nil and NotDetectedError.
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
	input := newRecognizerInput(b, d.stripTag)

	// One single-slot channel per recognizer: the goroutines still run in
	// parallel and never block on the send, but collecting by position makes
	// tie-breaking independent of which goroutine happens to finish first.
	pending := make([]chan recognizerOutput, len(d.recognizers))
	for i, rec := range d.recognizers {
		pending[i] = make(chan recognizerOutput, 1)
		go matchHelper(rec, input, pending[i])
	}

	var best Result
	for _, ch := range pending {
		// Strict comparison keeps the earliest recognizer on equal Confidence.
		if o := <-ch; o.Confidence > best.Confidence {
			best = Result(o)
		}
	}
	if best.Confidence == 0 {
		return nil, NotDetectedError
	}
	return &best, nil
}
// DetectAll runs every recognizer of d against b and returns the Results that
// have non-zero Confidence, sorted by Confidence in descending order.
//
// Each Charset appears at most once: when several recognizers report the same
// Charset, only the entry that sorts first is kept. The recognizers run
// concurrently, but their outputs are collected in the order the recognizers
// are listed in d and sorted with a stable sort, so entries with equal
// Confidence keep recognizer order and the outcome does not depend on
// goroutine scheduling.
//
// If no recognizer reports a Confidence above zero, DetectAll returns nil and
// NotDetectedError.
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
	input := newRecognizerInput(b, d.stripTag)

	// One single-slot channel per recognizer so that results can be read back
	// by position instead of by completion time.
	pending := make([]chan recognizerOutput, len(d.recognizers))
	for i, rec := range d.recognizers {
		pending[i] = make(chan recognizerOutput, 1)
		go matchHelper(rec, input, pending[i])
	}

	outputs := make(recognizerOutputs, 0, len(d.recognizers))
	for _, ch := range pending {
		if o := <-ch; o.Confidence > 0 {
			outputs = append(outputs, o)
		}
	}
	if len(outputs) == 0 {
		return nil, NotDetectedError
	}

	// Stable, so equal-Confidence entries stay in recognizer order.
	sort.Stable(outputs)

	// outputs is non-empty here, so results ends up non-empty as well.
	results := make([]Result, 0, len(outputs))
	seen := make(map[string]struct{}, len(outputs))
	for _, o := range outputs {
		if _, dup := seen[o.Charset]; dup {
			continue
		}
		seen[o.Charset] = struct{}{}
		results = append(results, Result(o))
	}
	return results, nil
}
// matchHelper runs r.Match on input and sends the single resulting
// recognizerOutput on outputChan. DetectBest and DetectAll start it as a
// goroutine, once per recognizer; the send blocks until the channel can
// accept the value.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
// recognizerOutputs is a slice of recognizerOutput that implements
// sort.Interface, ordering its elements by Confidence from highest to lowest.
type recognizerOutputs []recognizerOutput

// Len reports the number of outputs in the slice.
func (rs recognizerOutputs) Len() int {
	return len(rs)
}

// Less reports whether the output at index i sorts before the one at index j,
// which is the case when it has the strictly greater Confidence.
func (rs recognizerOutputs) Less(i, j int) bool {
	return rs[j].Confidence < rs[i].Confidence
}

// Swap exchanges the outputs at indexes i and j.
func (rs recognizerOutputs) Swap(i, j int) {
	tmp := rs[i]
	rs[i] = rs[j]
	rs[j] = tmp
}