356 lines
8.5 KiB
Go
356 lines
8.5 KiB
Go
// Copyright 2014 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package runes provide transforms for UTF-8 encoded text.
|
|
package runes // import "golang.org/x/text/runes"
|
|
|
|
import (
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// A Set is a collection of runes.
|
|
type Set interface {
|
|
// Contains returns true if r is contained in the set.
|
|
Contains(r rune) bool
|
|
}
|
|
|
|
type setFunc func(rune) bool
|
|
|
|
func (s setFunc) Contains(r rune) bool {
|
|
return s(r)
|
|
}
|
|
|
|
// Note: using funcs here instead of wrapping types result in cleaner
|
|
// documentation and a smaller API.
|
|
|
|
// In creates a Set with a Contains method that returns true for all runes in
|
|
// the given RangeTable.
|
|
func In(rt *unicode.RangeTable) Set {
|
|
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
|
|
}
|
|
|
|
// In creates a Set with a Contains method that returns true for all runes not
|
|
// in the given RangeTable.
|
|
func NotIn(rt *unicode.RangeTable) Set {
|
|
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
|
|
}
|
|
|
|
// Predicate creates a Set with a Contains method that returns f(r).
|
|
func Predicate(f func(rune) bool) Set {
|
|
return setFunc(f)
|
|
}
|
|
|
|
// Transformer implements the transform.Transformer interface.
|
|
type Transformer struct {
|
|
t transform.SpanningTransformer
|
|
}
|
|
|
|
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
return t.t.Transform(dst, src, atEOF)
|
|
}
|
|
|
|
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
|
|
return t.t.Span(b, atEOF)
|
|
}
|
|
|
|
func (t Transformer) Reset() { t.t.Reset() }
|
|
|
|
// Bytes returns a new byte slice with the result of converting b using t. It
|
|
// calls Reset on t. It returns nil if any error was found. This can only happen
|
|
// if an error-producing Transformer is passed to If.
|
|
func (t Transformer) Bytes(b []byte) []byte {
|
|
b, _, err := transform.Bytes(t, b)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return b
|
|
}
|
|
|
|
// String returns a string with the result of converting s using t. It calls
|
|
// Reset on t. It returns the empty string if any error was found. This can only
|
|
// happen if an error-producing Transformer is passed to If.
|
|
func (t Transformer) String(s string) string {
|
|
s, _, err := transform.String(t, s)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
return s
|
|
}
|
|
|
|
// TODO:
|
|
// - Copy: copying strings and bytes in whole-rune units.
|
|
// - Validation (maybe)
|
|
// - Well-formed-ness (maybe)
|
|
|
|
const runeErrorString = string(utf8.RuneError)
|
|
|
|
// Remove returns a Transformer that removes runes r for which s.Contains(r).
|
|
// Illegal input bytes are replaced by RuneError before being passed to f.
|
|
func Remove(s Set) Transformer {
|
|
if f, ok := s.(setFunc); ok {
|
|
// This little trick cuts the running time of BenchmarkRemove for sets
|
|
// created by Predicate roughly in half.
|
|
// TODO: special-case RangeTables as well.
|
|
return Transformer{remove(f)}
|
|
}
|
|
return Transformer{remove(s.Contains)}
|
|
}
|
|
|
|
// TODO: remove transform.RemoveFunc.
|
|
|
|
type remove func(r rune) bool
|
|
|
|
func (remove) Reset() {}
|
|
|
|
// Span implements transform.Spanner.
|
|
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
|
|
for r, size := rune(0), 0; n < len(src); {
|
|
if r = rune(src[n]); r < utf8.RuneSelf {
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[n:]) {
|
|
err = transform.ErrShortSrc
|
|
} else {
|
|
err = transform.ErrEndOfSpan
|
|
}
|
|
break
|
|
}
|
|
if t(r) {
|
|
err = transform.ErrEndOfSpan
|
|
break
|
|
}
|
|
n += size
|
|
}
|
|
return
|
|
}
|
|
|
|
// Transform implements transform.Transformer.
|
|
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
for r, size := rune(0), 0; nSrc < len(src); {
|
|
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
// We replace illegal bytes with RuneError. Not doing so might
|
|
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
|
// The resulting byte sequence may subsequently contain runes
|
|
// for which t(r) is true that were passed unnoticed.
|
|
if !t(utf8.RuneError) {
|
|
if nDst+3 > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst+0] = runeErrorString[0]
|
|
dst[nDst+1] = runeErrorString[1]
|
|
dst[nDst+2] = runeErrorString[2]
|
|
nDst += 3
|
|
}
|
|
nSrc++
|
|
continue
|
|
}
|
|
if t(r) {
|
|
nSrc += size
|
|
continue
|
|
}
|
|
if nDst+size > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < size; i++ {
|
|
dst[nDst] = src[nSrc]
|
|
nDst++
|
|
nSrc++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// Map returns a Transformer that maps the runes in the input using the given
|
|
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
|
|
// being passed to the mapping func.
|
|
func Map(mapping func(rune) rune) Transformer {
|
|
return Transformer{mapper(mapping)}
|
|
}
|
|
|
|
type mapper func(rune) rune
|
|
|
|
func (mapper) Reset() {}
|
|
|
|
// Span implements transform.Spanner.
|
|
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
|
|
for r, size := rune(0), 0; n < len(src); n += size {
|
|
if r = rune(src[n]); r < utf8.RuneSelf {
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[n:]) {
|
|
err = transform.ErrShortSrc
|
|
} else {
|
|
err = transform.ErrEndOfSpan
|
|
}
|
|
break
|
|
}
|
|
if t(r) != r {
|
|
err = transform.ErrEndOfSpan
|
|
break
|
|
}
|
|
}
|
|
return n, err
|
|
}
|
|
|
|
// Transform implements transform.Transformer.
|
|
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
var replacement rune
|
|
var b [utf8.UTFMax]byte
|
|
|
|
for r, size := rune(0), 0; nSrc < len(src); {
|
|
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
|
if replacement = t(r); replacement < utf8.RuneSelf {
|
|
if nDst == len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst] = byte(replacement)
|
|
nDst++
|
|
nSrc++
|
|
continue
|
|
}
|
|
size = 1
|
|
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
|
|
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
|
|
if nDst+3 > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst+0] = runeErrorString[0]
|
|
dst[nDst+1] = runeErrorString[1]
|
|
dst[nDst+2] = runeErrorString[2]
|
|
nDst += 3
|
|
nSrc++
|
|
continue
|
|
}
|
|
} else if replacement = t(r); replacement == r {
|
|
if nDst+size > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < size; i++ {
|
|
dst[nDst] = src[nSrc]
|
|
nDst++
|
|
nSrc++
|
|
}
|
|
continue
|
|
}
|
|
|
|
n := utf8.EncodeRune(b[:], replacement)
|
|
|
|
if nDst+n > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
for i := 0; i < n; i++ {
|
|
dst[nDst] = b[i]
|
|
nDst++
|
|
}
|
|
nSrc += size
|
|
}
|
|
return
|
|
}
|
|
|
|
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
|
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
|
func ReplaceIllFormed() Transformer {
|
|
return Transformer{&replaceIllFormed{}}
|
|
}
|
|
|
|
type replaceIllFormed struct{ transform.NopResetter }
|
|
|
|
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
|
|
for n < len(src) {
|
|
// ASCII fast path.
|
|
if src[n] < utf8.RuneSelf {
|
|
n++
|
|
continue
|
|
}
|
|
|
|
r, size := utf8.DecodeRune(src[n:])
|
|
|
|
// Look for a valid non-ASCII rune.
|
|
if r != utf8.RuneError || size != 1 {
|
|
n += size
|
|
continue
|
|
}
|
|
|
|
// Look for short source data.
|
|
if !atEOF && !utf8.FullRune(src[n:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
|
|
// We have an invalid rune.
|
|
err = transform.ErrEndOfSpan
|
|
break
|
|
}
|
|
return n, err
|
|
}
|
|
|
|
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
for nSrc < len(src) {
|
|
// ASCII fast path.
|
|
if r := src[nSrc]; r < utf8.RuneSelf {
|
|
if nDst == len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst] = r
|
|
nDst++
|
|
nSrc++
|
|
continue
|
|
}
|
|
|
|
// Look for a valid non-ASCII rune.
|
|
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
|
|
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
nDst += size
|
|
nSrc += size
|
|
continue
|
|
}
|
|
|
|
// Look for short source data.
|
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
|
err = transform.ErrShortSrc
|
|
break
|
|
}
|
|
|
|
// We have an invalid rune.
|
|
if nDst+3 > len(dst) {
|
|
err = transform.ErrShortDst
|
|
break
|
|
}
|
|
dst[nDst+0] = runeErrorString[0]
|
|
dst[nDst+1] = runeErrorString[1]
|
|
dst[nDst+2] = runeErrorString[2]
|
|
nDst += 3
|
|
nSrc++
|
|
}
|
|
return nDst, nSrc, err
|
|
}
|