Initial commit

This commit is contained in:
Dusan Kasan 2017-04-10 22:44:22 +02:00
commit 17e497ea98
6 changed files with 750 additions and 0 deletions

3
CHANGELOG.md Normal file
View File

@ -0,0 +1,3 @@
# Changelog
## No versions tagged yet

19
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,19 @@
## How to contribute
This project is open to contribution from anyone, as long as you cover your changes with tests. Your pull requests will be merged after your code passes CI and manual code review.
Every change merges to master. No development is done in other branches.
## Typical contribution use case
- You need a feature that is not implemented yet
- Search for open/closed issues relating to what you need
- If you don't find anything, create a new issue
- Fork this repository and create fix/feature in the fork
- Write tests for your change
- If you changed API, document the change in README
- Create pull request, describe what you did
- Wait for CI to verify you didn't break anything
- If you did, rewrite it
- If CI passes, wait for manual review by repo's owner
- Your pull request will be merged into master

21
LICENSE.md Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2017 Dusan Kasan
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

29
README.md Normal file
View File

@ -0,0 +1,29 @@
# Parsemail - simple email parsing Go library
Simple usage:
```go
var reader io.Reader // this reads an email message
email, err := parsemail.Parse(reader) // returns Email struct and error
if err != nil {
// handle error
}
fmt.Println(email.Subject()) // a method; the value is read from the header on demand
fmt.Println(email.Cc()) // a method; the value is read from the header on demand
fmt.Println(email.HTMLBody) // a field, not a method
```
## This library is WIP.
It is missing some tests, and needs more work. Use at your own discretion.
## TODO
- CI
- Readme with use cases
- More tests for 100% coverage
- email address type => getEmail, getName
- quoted text?

348
parsemail.go Normal file
View File

@ -0,0 +1,348 @@
package parsemail
import (
"net/mail"
"io"
"strings"
"mime/multipart"
"mime"
"fmt"
"errors"
"io/ioutil"
"time"
"encoding/base64"
"bytes"
)
// Parse reads a single e-mail message from r and returns it as an Email.
//
// The message headers are MIME-decoded and stored in Email.Header. The body is
// interpreted according to the top level Content-Type, which must be one of
// multipart/mixed, multipart/alternative, text/plain or text/html; any other
// type yields an error. For the two text types a single trailing "\n" is
// stripped from the body.
//
// On error the returned Email holds whatever was parsed before the failure and
// should not be relied upon.
func Parse(r io.Reader) (Email, error) {
	email := Email{}

	msg, err := mail.ReadMessage(r)
	if err != nil {
		return email, err
	}

	// NOTE: the body is deliberately not touched before dispatching on the
	// content type. A zero-length Read on the underlying buffered reader
	// reports io.EOF for messages without a body, which used to make Parse
	// fail for otherwise valid header-only input.

	email.Header, err = decodeHeaderMime(msg.Header)
	if err != nil {
		return email, err
	}

	mediaType, params, err := mime.ParseMediaType(msg.Header.Get("Content-Type"))
	if err != nil {
		return email, err
	}

	switch {
	case mediaType == "":
		return email, errors.New("No top level mime type specified")
	case strings.HasPrefix(mediaType, "multipart/mixed"):
		email.TextBody, email.HTMLBody, email.Attachments, email.EmbeddedFiles, err = parseMultipartMixed(msg.Body, params["boundary"])
		if err != nil {
			return email, err
		}
	case strings.HasPrefix(mediaType, "multipart/alternative"):
		email.TextBody, email.HTMLBody, email.EmbeddedFiles, err = parseMultipartAlternative(msg.Body, params["boundary"])
		if err != nil {
			return email, err
		}
	case strings.HasPrefix(mediaType, "text/plain"):
		message, err := ioutil.ReadAll(msg.Body)
		if err != nil {
			return email, err
		}
		email.TextBody = strings.TrimSuffix(string(message), "\n")
	case strings.HasPrefix(mediaType, "text/html"):
		message, err := ioutil.ReadAll(msg.Body)
		if err != nil {
			return email, err
		}
		email.HTMLBody = strings.TrimSuffix(string(message), "\n")
	default:
		return email, fmt.Errorf("Unknown top level mime type: %s", mediaType)
	}

	return email, nil
}
// decodeMimeSentence decodes every RFC 2047 encoded-word found in s and
// returns the resulting text.
//
// s is split on single spaces and each token is decoded on its own. Tokens
// that are not valid encoded-words (or that use an unsupported charset) are
// kept verbatim. The separating space is dropped only between two adjacent
// encoded-words; in every other position it is preserved, so a plain word
// followed by an encoded-word keeps its separator.
//
// The returned error is always nil; it is part of the signature so callers
// do not have to change if decoding ever becomes strict.
func decodeMimeSentence(s string) (string, error) {
	dec := new(mime.WordDecoder)
	words := strings.Split(s, " ")
	result := make([]string, 0, len(words))

	prevEncoded := false
	for i, word := range words {
		decoded, err := dec.Decode(word)
		encoded := err == nil
		if !encoded {
			decoded = word
		}

		// Restore the space consumed by Split unless it sat between two
		// encoded-words, where it is only a separator and not content.
		if i > 0 && !(encoded && prevEncoded) {
			decoded = " " + decoded
		}

		result = append(result, decoded)
		prevEncoded = encoded
	}

	return strings.Join(result, ""), nil
}
// parseMultipartAlternative walks the parts of a multipart body delimited by
// boundary and collects its textual and embedded content.
//
// Parts are handled as follows:
//   - text/plain is appended to textBody (one trailing "\n" stripped);
//   - text/html is appended to htmlBody (one trailing "\n" stripped);
//   - multipart/related is parsed recursively with this same function and its
//     results are merged into the current ones;
//   - any other part that declares a Content-Transfer-Encoding is decoded and
//     returned as an EmbeddedFile keyed by its Content-Id (angle brackets
//     removed);
//   - everything else is an error.
//
// On error the values collected so far are returned alongside it.
func parseMultipartAlternative(msg io.Reader, boundary string) (textBody, htmlBody string, embeddedFiles []EmbeddedFile, err error) {
	pmr := multipart.NewReader(msg, boundary)
	for {
		part, err := pmr.NextPart()
		if err == io.EOF {
			break
		}
		if err != nil {
			return textBody, htmlBody, embeddedFiles, err
		}

		// The parse error is intentionally ignored: a part with a missing or
		// malformed Content-Type yields an empty media type and falls through
		// to the transfer-encoding / error branches below.
		mediaType, params, _ := mime.ParseMediaType(part.Header.Get("Content-Type"))

		switch {
		case mediaType == "text/plain":
			content, err := ioutil.ReadAll(part)
			if err != nil {
				return textBody, htmlBody, embeddedFiles, err
			}
			textBody += strings.TrimSuffix(string(content), "\n")

		case mediaType == "text/html":
			content, err := ioutil.ReadAll(part)
			if err != nil {
				return textBody, htmlBody, embeddedFiles, err
			}
			htmlBody += strings.TrimSuffix(string(content), "\n")

		case mediaType == "multipart/related":
			tb, hb, ef, err := parseMultipartAlternative(part, params["boundary"])
			htmlBody += hb
			textBody += tb
			embeddedFiles = append(embeddedFiles, ef...)
			// A failure inside the nested container must not be swallowed.
			if err != nil {
				return textBody, htmlBody, embeddedFiles, err
			}

		case part.Header.Get("Content-Transfer-Encoding") != "":
			reference, err := decodeMimeSentence(part.Header.Get("Content-Id"))
			if err != nil {
				return textBody, htmlBody, embeddedFiles, err
			}
			reference = strings.Trim(reference, "<>")

			decoded, err := decodePartData(part)
			if err != nil {
				return textBody, htmlBody, embeddedFiles, err
			}

			embeddedFiles = append(embeddedFiles, EmbeddedFile{CID: reference, Data: decoded})

		default:
			return textBody, htmlBody, embeddedFiles, fmt.Errorf("Can't process multipart/alternative inner mime type: %s", mediaType)
		}
	}

	return textBody, htmlBody, embeddedFiles, nil
}
// parseMultipartMixed walks the parts of a multipart/mixed body delimited by
// boundary.
//
// A multipart/alternative part is delegated to parseMultipartAlternative and
// its text, HTML and embedded files are merged into the results gathered so
// far (several such parts therefore accumulate instead of replacing each
// other). Any other part that carries a file name is decoded and returned as
// an Attachment. Parts matching neither rule produce an error.
//
// On error the values collected so far are returned alongside it.
func parseMultipartMixed(msg io.Reader, boundary string) (textBody, htmlBody string, attachments []Attachment, embeddedFiles []EmbeddedFile, err error) {
	mr := multipart.NewReader(msg, boundary)
	for {
		part, err := mr.NextPart()
		if err == io.EOF {
			break
		}
		if err != nil {
			return textBody, htmlBody, attachments, embeddedFiles, err
		}

		mediaType, params, err := mime.ParseMediaType(part.Header.Get("Content-Type"))
		if err != nil {
			return textBody, htmlBody, attachments, embeddedFiles, err
		}

		switch {
		case strings.HasPrefix(mediaType, "multipart/alternative"):
			tb, hb, ef, err := parseMultipartAlternative(part, params["boundary"])
			textBody += tb
			htmlBody += hb
			embeddedFiles = append(embeddedFiles, ef...)
			if err != nil {
				return textBody, htmlBody, attachments, embeddedFiles, err
			}

		case part.FileName() != "":
			filename, err := decodeMimeSentence(part.FileName())
			if err != nil {
				return textBody, htmlBody, attachments, embeddedFiles, err
			}

			decoded, err := decodePartData(part)
			if err != nil {
				return textBody, htmlBody, attachments, embeddedFiles, err
			}

			attachments = append(attachments, Attachment{Filename: filename, Data: decoded})

		default:
			return textBody, htmlBody, attachments, embeddedFiles, fmt.Errorf("Unknown multipart/mixed nested mime type: %s", mediaType)
		}
	}

	return textBody, htmlBody, attachments, embeddedFiles, nil
}
// decodeHeaderMime returns a copy of header in which every value has been run
// through decodeMimeSentence. Header names are kept as they are. If decoding
// any value fails, an empty header and the error are returned.
func decodeHeaderMime(header mail.Header) (mail.Header, error) {
	decoded := make(map[string][]string)

	for name, values := range header {
		out := make([]string, 0, len(values))
		for i := range values {
			value, err := decodeMimeSentence(values[i])
			if err != nil {
				return mail.Header{}, err
			}
			out = append(out, value)
		}
		decoded[name] = out
	}

	return mail.Header(decoded), nil
}
// decodePartData reads the whole body of part, undoes its
// Content-Transfer-Encoding and returns the result as an in-memory reader.
//
// The encoding name is matched case-insensitively. Supported values are
// "base64" and the identity encodings "7bit", "8bit" and "binary"; a part
// without the header is treated as identity-encoded as well. Any other value
// results in an error.
func decodePartData(part *multipart.Part) (io.Reader, error) {
	encoding := part.Header.Get("Content-Transfer-Encoding")

	var src io.Reader
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "base64":
		src = base64.NewDecoder(base64.StdEncoding, part)
	case "", "7bit", "8bit", "binary":
		// Identity encodings: the body is already the payload.
		src = part
	default:
		return nil, fmt.Errorf("Unknown encoding: %s", encoding)
	}

	data, err := ioutil.ReadAll(src)
	if err != nil {
		return nil, err
	}

	return bytes.NewReader(data), nil
}
// Attachment is a file attached to a message: a multipart/mixed part that
// carries a file name.
type Attachment struct {
// Filename is the part's file name after MIME encoded-word decoding.
Filename string
// Data yields the transfer-decoded content of the part. It is a plain
// io.Reader, so the content can be consumed only once.
Data io.Reader
}
// EmbeddedFile is a non-text part found next to the message bodies, typically
// referenced from the HTML body by its content id.
type EmbeddedFile struct {
// CID is the part's Content-Id header with the surrounding angle brackets
// removed.
CID string
// Data yields the transfer-decoded content of the part. It is a plain
// io.Reader, so the content can be consumed only once.
Data io.Reader
}
// Email is the result of parsing a message with Parse.
type Email struct {
// Header holds the message headers with MIME encoded-words already decoded.
// The accessor methods on Email read their values from it on every call.
Header mail.Header
// HTMLBody is the text/html content of the message, if any.
HTMLBody string
// TextBody is the text/plain content of the message, if any.
TextBody string
// Attachments lists the files found in a multipart/mixed message.
Attachments []Attachment
// EmbeddedFiles lists the content-id addressed parts found in the message.
EmbeddedFiles []EmbeddedFile
}
// Subject returns the value of the "Subject" header, or "" when the header is
// absent.
func (e *Email) Subject() string {
	subject := e.Header.Get("Subject")
	return subject
}
// Sender returns the value of the "Sender" header, or "" when the header is
// absent.
func (e *Email) Sender() string {
	sender := e.Header.Get("Sender")
	return sender
}
// splitAddressList splits the value of an address header into its individual
// entries.
//
// Entries are separated by commas, except for commas that appear inside a
// double-quoted string (such as the display name in `"Doe, John" <j@d.com>`);
// a backslash inside quotes escapes the following character. Each entry is
// stripped of surrounding spaces and empty entries are dropped. The result is
// never nil. If a quote is left unterminated, the remainder of the value is
// returned as a single entry.
func splitAddressList(value string) []string {
	result := []string{}

	start := 0
	emit := func(end int) {
		if entry := strings.Trim(value[start:end], " "); entry != "" {
			result = append(result, entry)
		}
	}

	inQuotes, escaped := false, false
	// Byte-wise iteration is safe: the characters of interest are ASCII and
	// never occur inside a multi-byte UTF-8 sequence.
	for i := 0; i < len(value); i++ {
		c := value[i]
		switch {
		case escaped:
			escaped = false
		case inQuotes && c == '\\':
			escaped = true
		case c == '"':
			inQuotes = !inQuotes
		case c == ',' && !inQuotes:
			emit(i)
			start = i + 1
		}
	}
	emit(len(value))

	return result
}

// From returns the entries of the "From" header, one string per address, in
// the form they appear in the (MIME-decoded) header.
func (e *Email) From() []string {
	return splitAddressList(e.Header.Get("From"))
}

// To returns the entries of the "To" header, one string per address, in the
// form they appear in the (MIME-decoded) header.
func (e *Email) To() []string {
	return splitAddressList(e.Header.Get("To"))
}

// Cc returns the entries of the "Cc" header, one string per address, in the
// form they appear in the (MIME-decoded) header.
func (e *Email) Cc() []string {
	return splitAddressList(e.Header.Get("Cc"))
}

// Bcc returns the entries of the "Bcc" header, one string per address, in the
// form they appear in the (MIME-decoded) header.
func (e *Email) Bcc() []string {
	return splitAddressList(e.Header.Get("Bcc"))
}

// ReplyTo returns the entries of the "Reply-To" header, one string per
// address, in the form they appear in the (MIME-decoded) header.
func (e *Email) ReplyTo() []string {
	return splitAddressList(e.Header.Get("Reply-To"))
}
// Date parses the "Date" header and returns it as a time.Time.
//
// The two layouts historically accepted by this method (RFC 1123 with a
// numeric zone, with either a zero-padded or an unpadded day of month) are
// tried first. If neither matches, the value is handed to mail.ParseDate,
// which understands the remaining RFC 5322 date forms. An error is returned
// when the header is missing or cannot be parsed.
func (e *Email) Date() (time.Time, error) {
	raw := e.Header.Get("Date")

	for _, layout := range []string{time.RFC1123Z, "Mon, 2 Jan 2006 15:04:05 -0700"} {
		if t, err := time.Parse(layout, raw); err == nil {
			return t, nil
		}
	}

	return mail.ParseDate(raw)
}
// MessageID returns the "Message-ID" header with any leading and trailing
// angle brackets removed.
func (e *Email) MessageID() string {
	raw := e.Header.Get("Message-ID")
	return strings.Trim(raw, "<>")
}
// InReplyTo returns the message ids listed in the "In-Reply-To" header. The
// header is split on spaces and each non-empty token has its surrounding
// angle brackets removed. The result is never nil.
func (e *Email) InReplyTo() []string {
	ids := []string{}
	tokens := strings.Split(e.Header.Get("In-Reply-To"), " ")
	for i := 0; i < len(tokens); i++ {
		if tokens[i] == "" {
			continue
		}
		ids = append(ids, strings.Trim(tokens[i], "<> "))
	}
	return ids
}
// References returns the message ids listed in the "References" header. The
// header is split on spaces and each non-empty token has its surrounding
// angle brackets removed. The result is never nil.
func (e *Email) References() []string {
	ids := []string{}
	tokens := strings.Split(e.Header.Get("References"), " ")
	for i := 0; i < len(tokens); i++ {
		if tokens[i] == "" {
			continue
		}
		ids = append(ids, strings.Trim(tokens[i], "<> "))
	}
	return ids
}

330
parsemail_test.go Normal file
View File

@ -0,0 +1,330 @@
package parsemail_test
import (
"testing"
"github.com/DusanKasan/parsemail"
"strings"
"time"
"net/mail"
"encoding/base64"
"io/ioutil"
)
// TestParseEmail parses each fixture message and compares every accessor of
// the resulting Email, its bodies, attachments and embedded files with the
// expected values of the table entry.
func TestParseEmail(t *testing.T) {
	var testData = []struct {
		mailData      string
		subject       string
		from          []string
		sender        string
		to            []string
		replyTo       []string
		cc            []string
		bcc           []string
		messageID     string
		inReplyTo     []string
		references    []string
		date          time.Time
		htmlBody      string
		textBody      string
		attachments   []attachmentData
		embeddedFiles []embeddedFileData
		headerCheck   func(mail.Header, *testing.T)
	}{
		{
			mailData:  Data1,
			subject:   "Test Subject 1",
			from:      []string{"Peter Paholík <peter.paholik@gmail.com>"},
			to:        []string{"dusan@kasan.sk"},
			messageID: "CACtgX4kNXE7T5XKSKeH_zEcfUUmf2vXVASxYjaaK9cCn-3zb_g@mail.gmail.com",
			date:      parseDate("Fri, 07 Apr 2017 09:17:26 +0200"),
			htmlBody:  "<div dir=\"ltr\"><br></div>",
			attachments: []attachmentData{
				{
					filename:   "Peter Paholík 1 4 2017 2017-04-07.pdf",
					base64data: "JVBERi0xLjQNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0NhdGFsb2cvUGFnZXMgMiAwIFIvTGFuZyhlbi1VUykgL1N0cnVjdFRyZWVSb290IDY3IDAgUi9NYXJrSW5mbzw8L01hcmtlZCB0cnVlPj4vT3V0cHV0SW50ZW50c1s8PC9UeXBlL091dHB1dEludGVudC9TL0dUU19QREZBMS9PdXRwdXRDb25kZXYgMzk1MzYyDQo+Pg0Kc3RhcnR4cmVmDQo0MTk4ODUNCiUlRU9GDQo=",
				},
			},
		},
		{
			mailData:   Data2,
			subject:    "Re: Test Subject 2",
			from:       []string{"Sender Man <sender@domain.com>"},
			to:         []string{"info@receiver.com"},
			cc:         []string{"Cc Man <ccman@gmail.com>"},
			messageID:  "0e9a21b4-01dc-e5c1-dcd6-58ce5aa61f4f@receiver.com",
			inReplyTo:  []string{"9ff38d03-c4ab-89b7-9328-e99d5e24e3ba@receiver.eu"},
			references: []string{"2f6b7595-c01e-46e5-42bc-f263e1c4282d@receiver.com", "9ff38d03-c4ab-89b7-9328-e99d5e24e3ba@domain.com"},
			date:       parseDate("Fri, 07 Apr 2017 12:59:55 +0200"),
			htmlBody:   `<html>data<img src="part2.9599C449.04E5EC81@develhell.com"/></html>`,
			textBody: `First level
> Second level
>> Third level
>
`,
			embeddedFiles: []embeddedFileData{
				{
					cid:        "part2.9599C449.04E5EC81@develhell.com",
					base64data: "iVBORw0KGgoAAAANSUhEUgAAAQEAAAAYCAIAAAB1IN9NAAAACXBIWXMAAAsTAAALEwEAmpwYYKUKF+Os3baUndC0pDnwNAmLy1SUr2Gw0luxQuV/AwC6cEhVV5VRrwAAAABJRU5ErkJggg==",
				},
			},
		},
	}

	for _, td := range testData {
		e, err := parsemail.Parse(strings.NewReader(td.mailData))
		if err != nil {
			// Nothing meaningful can be compared when parsing failed.
			t.Error(err)
			continue
		}

		if td.subject != e.Subject() {
			t.Errorf("Wrong subject. Expected: %s, Got: %s", td.subject, e.Subject())
		}

		if td.sender != e.Sender() {
			t.Errorf("Wrong sender. Expected: %s, Got: %s", td.sender, e.Sender())
		}

		if !assertSliceEq(td.from, e.From()) {
			t.Errorf("Wrong from. Expected: %s, Got: %s", td.from, e.From())
		}

		if !assertSliceEq(td.inReplyTo, e.InReplyTo()) {
			t.Errorf("Wrong in reply to. Expected: %s, Got: %s", td.inReplyTo, e.InReplyTo())
		}

		if !assertSliceEq(td.references, e.References()) {
			t.Errorf("Wrong references. Expected: %s, Got: %s", td.references, e.References())
		}

		if !assertSliceEq(td.to, e.To()) {
			t.Errorf("Wrong to. Expected: %s, Got: %s", td.to, e.To())
		}

		if !assertSliceEq(td.replyTo, e.ReplyTo()) {
			t.Errorf("Wrong reply to. Expected: %s, Got: %s", td.replyTo, e.ReplyTo())
		}

		if !assertSliceEq(td.cc, e.Cc()) {
			t.Errorf("Wrong cc. Expected: %s, Got: %s", td.cc, e.Cc())
		}

		if !assertSliceEq(td.bcc, e.Bcc()) {
			t.Errorf("Wrong bcc. Expected: %s, Got: %s", td.bcc, e.Bcc())
		}

		date, err := e.Date()
		if err != nil {
			t.Error(err)
		} else if !td.date.Equal(date) {
			// Equal compares the instants; == would also compare the
			// *time.Location pointers, which differ between two parses.
			t.Errorf("Wrong date. Expected: %v, Got: %v", td.date, date)
		}

		if td.htmlBody != e.HTMLBody {
			t.Errorf("Wrong html body. Expected: '%s', Got: '%s'", td.htmlBody, e.HTMLBody)
		}

		if td.textBody != e.TextBody {
			t.Errorf("Wrong text body. Expected: '%s', Got: '%s'", td.textBody, e.TextBody)
		}

		if td.messageID != e.MessageID() {
			t.Errorf("Wrong messageID. Expected: '%s', Got: '%s'", td.messageID, e.MessageID())
		}

		if len(td.attachments) != len(e.Attachments) {
			t.Errorf("Incorrect number of attachments! Expected: %v, Got: %v.", len(td.attachments), len(e.Attachments))
		} else {
			// Data is a one-shot io.Reader, so every attachment is read
			// exactly once up front and matched on the captured bytes.
			remaining := make([]attachmentData, 0, len(e.Attachments))
			for _, ra := range e.Attachments {
				b, err := ioutil.ReadAll(ra.Data)
				if err != nil {
					t.Error(err)
				}
				remaining = append(remaining, attachmentData{
					filename:   ra.Filename,
					base64data: base64.StdEncoding.EncodeToString(b),
				})
			}

			for _, ad := range td.attachments {
				found := false
				for i, got := range remaining {
					if got == ad {
						found = true
						remaining = append(remaining[:i], remaining[i+1:]...)
						break
					}
				}

				if !found {
					t.Errorf("Attachment not found: %s", ad.filename)
				}
			}

			if len(remaining) != 0 {
				t.Errorf("Email contains %v unexpected attachments: %v", len(remaining), remaining)
			}
		}

		if len(td.embeddedFiles) != len(e.EmbeddedFiles) {
			t.Errorf("Incorrect number of embedded files! Expected: %v, Got: %v.", len(td.embeddedFiles), len(e.EmbeddedFiles))
		} else {
			// Same approach as for attachments: read once, then match.
			remaining := make([]embeddedFileData, 0, len(e.EmbeddedFiles))
			for _, ra := range e.EmbeddedFiles {
				b, err := ioutil.ReadAll(ra.Data)
				if err != nil {
					t.Error(err)
				}
				remaining = append(remaining, embeddedFileData{
					cid:        ra.CID,
					base64data: base64.StdEncoding.EncodeToString(b),
				})
			}

			for _, ad := range td.embeddedFiles {
				found := false
				for i, got := range remaining {
					if got == ad {
						found = true
						remaining = append(remaining[:i], remaining[i+1:]...)
						break
					}
				}

				if !found {
					t.Errorf("Embedded file not found: %s", ad.cid)
				}
			}

			if len(remaining) != 0 {
				t.Errorf("Email contains %v unexpected embedded files: %v", len(remaining), remaining)
			}
		}
	}
}
// parseDate converts an RFC 1123 date with numeric zone into a time.Time.
// It is a test helper and panics when in cannot be parsed.
func parseDate(in string) time.Time {
	parsed, parseErr := time.Parse(time.RFC1123Z, in)
	if parseErr == nil {
		return parsed
	}
	panic(parseErr)
}
// attachmentData describes an attachment expected by TestParseEmail.
type attachmentData struct{
// filename is the expected, already decoded, attachment file name.
filename string
// base64data is the expected content, base64-encoded with the standard
// alphabet so binary payloads can be written as string literals.
base64data string
}
// embeddedFileData describes an embedded file expected by TestParseEmail.
type embeddedFileData struct{
// cid is the expected content id, without angle brackets.
cid string
// base64data is the expected content, base64-encoded with the standard
// alphabet so binary payloads can be written as string literals.
base64data string
}
// assertSliceEq reports whether a and b hold the same strings in the same
// order. A nil slice and an empty slice are considered equal.
func assertSliceEq(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	for idx, want := range a {
		if b[idx] != want {
			return false
		}
	}

	return true
}
// Data1 is a raw multipart/mixed fixture message. It contains a
// multipart/alternative part (a text/plain and a text/html alternative) and a
// base64-encoded application/pdf attachment whose file name is split over two
// MIME encoded-words.
// NOTE(review): no empty line between header blocks and bodies is visible in
// this fixture; a parser normally needs one to find the body — confirm the
// file on disk still has them.
var Data1 = `From: =?UTF-8?Q?Peter_Pahol=C3=ADk?= <peter.paholik@gmail.com>
Date: Fri, 7 Apr 2017 09:17:26 +0200
Message-ID: <CACtgX4kNXE7T5XKSKeH_zEcfUUmf2vXVASxYjaaK9cCn-3zb_g@mail.gmail.com>
Subject: Test Subject 1
To: dusan@kasan.sk
Content-Type: multipart/mixed; boundary=f403045f1dcc043a44054c8e6bbf
--f403045f1dcc043a44054c8e6bbf
Content-Type: multipart/alternative; boundary=f403045f1dcc043a3f054c8e6bbd
--f403045f1dcc043a3f054c8e6bbd
Content-Type: text/plain; charset=UTF-8
--f403045f1dcc043a3f054c8e6bbd
Content-Type: text/html; charset=UTF-8
<div dir="ltr"><br></div>
--f403045f1dcc043a3f054c8e6bbd--
--f403045f1dcc043a44054c8e6bbf
Content-Type: application/pdf;
name="=?UTF-8?Q?Peter_Paholi=CC=81k_1?=
=?UTF-8?Q?_4_2017_2017=2D04=2D07=2Epdf?="
Content-Disposition: attachment;
filename="=?UTF-8?Q?Peter_Paholi=CC=81k_1?=
=?UTF-8?Q?_4_2017_2017=2D04=2D07=2Epdf?="
Content-Transfer-Encoding: base64
X-Attachment-Id: f_j17i0f0d0
JVBERi0xLjQNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0NhdGFsb2cvUGFnZXMgMiAwIFIvTGFu
Zyhlbi1VUykgL1N0cnVjdFRyZWVSb290IDY3IDAgUi9NYXJrSW5mbzw8L01hcmtlZCB0cnVlPj4v
T3V0cHV0SW50ZW50c1s8PC9UeXBlL091dHB1dEludGVudC9TL0dUU19QREZBMS9PdXRwdXRDb25k
ZXYgMzk1MzYyDQo+Pg0Kc3RhcnR4cmVmDQo0MTk4ODUNCiUlRU9GDQo=
--f403045f1dcc043a44054c8e6bbf--
`
// Data2 is a raw multipart/alternative fixture message. It contains a
// text/plain alternative with quoted reply levels and a multipart/related
// alternative that bundles a text/html body with a base64-encoded image/png
// part addressed by Content-ID.
// NOTE(review): no empty line between header blocks and bodies is visible in
// this fixture; a parser normally needs one to find the body — confirm the
// file on disk still has them.
var Data2 = `Subject: Re: Test Subject 2
To: info@receiver.com
References: <2f6b7595-c01e-46e5-42bc-f263e1c4282d@receiver.com>
<9ff38d03-c4ab-89b7-9328-e99d5e24e3ba@domain.com>
Cc: Cc Man <ccman@gmail.com>
From: Sender Man <sender@domain.com>
Message-ID: <0e9a21b4-01dc-e5c1-dcd6-58ce5aa61f4f@receiver.com>
Date: Fri, 7 Apr 2017 12:59:55 +0200
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:45.0)
Gecko/20100101 Thunderbird/45.8.0
MIME-Version: 1.0
In-Reply-To: <9ff38d03-c4ab-89b7-9328-e99d5e24e3ba@receiver.eu>
Content-Type: multipart/alternative;
boundary="------------C70C0458A558E585ACB75FB4"
This is a multi-part message in MIME format.
--------------C70C0458A558E585ACB75FB4
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Transfer-Encoding: 8bit
First level
> Second level
>> Third level
>
--------------C70C0458A558E585ACB75FB4
Content-Type: multipart/related;
boundary="------------5DB4A1356834BB602A5F88B2"
--------------5DB4A1356834BB602A5F88B2
Content-Type: text/html; charset=utf-8
Content-Transfer-Encoding: 8bit
<html>data<img src="part2.9599C449.04E5EC81@develhell.com"/></html>
--------------5DB4A1356834BB602A5F88B2
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-ID: <part2.9599C449.04E5EC81@develhell.com>
iVBORw0KGgoAAAANSUhEUgAAAQEAAAAYCAIAAAB1IN9NAAAACXBIWXMAAAsTAAALEwEAmpwY
YKUKF+Os3baUndC0pDnwNAmLy1SUr2Gw0luxQuV/AwC6cEhVV5VRrwAAAABJRU5ErkJggg==
--------------5DB4A1356834BB602A5F88B2
--------------C70C0458A558E585ACB75FB4--
`