diff --git a/bridges/Mailman2Bridge.php b/bridges/Mailman2Bridge.php new file mode 100644 index 00000000..ad0d8110 --- /dev/null +++ b/bridges/Mailman2Bridge.php @@ -0,0 +1,142 @@ + [ + 'url' => [ + 'name' => 'Enter web archive URL', + 'title' => <<<"EOL" + Specify the URL from the archive page where all the archive are listed by month. + EOL + , 'type' => 'text', + 'exampleValue' => 'https://mailman.nginx.org/pipermail/nginx-announce/', + 'required' => true + ], + 'limit' => [ + 'name' => 'Limit', + 'type' => 'number', + 'title' => 'Maximum number of items to return', + 'defaultValue' => 5, + ], + ], + ]; + + public function collectData() + { + $mails = []; + $url = $this->getInput('url'); + $limit = $this->getInput('limit'); + + $html = defaultLinkTo(getSimpleHTMLDOMCached($url, 1800), $url); + + // Fetch archive urls from the frontpage + $archives = []; + foreach ($html->find('tr') as $key => $tr) { + $archiveUrl = $tr->find('a[href$="date.html"]', 0); + $downloadUrl = $tr->find('a[href$=".txt"], a[href$=".txt.gz"]', 0); + $archives[$key] = [ + 'bydate' => $archiveUrl ? $archiveUrl->getAttribute('href') : null, + 'download' => $downloadUrl ? $downloadUrl->getAttribute('href') : null + ]; + } + + foreach ($archives as $archive) { + if (!$archive['bydate']) { + continue; + } + + // Fetch urls to mails + $parent = pathinfo($archive['bydate'], PATHINFO_DIRNAME) . '/'; + $html = defaultLinkTo(getSimpleHTMLDOMCached($archive['bydate'], 1800), $parent); + $links = array_map(function ($val) { + return $val->getAttribute('href'); + }, $html->find('ul', 1)->find('li a[href$=".html"]')); + $mailUrls = array_reverse($links); + + // Parse mbox + $data = getContents($archive['download']); + if (str_ends_with($archive['download'], '.gz')) { + $data = \gzdecode($data, (1024 ** 2) * 25); // 25M + if ($data === false) { + throw new \Exception('Failed to gzdecode'); + } + } + $mboxParts = preg_split('/^From /', $data); + // Drop the first element which is always an empty string + array_shift($mboxParts); + $mboxMails = array_reverse($mboxParts); + foreach ($mboxMails as $index => $content) { + // Match Urls with contents from txt files. + // Urls cannot be reconstructed from the txt content. + $mails[] = [ + 'url' => $mailUrls[$index], + 'content' => $content + ]; + } + if (count($mails) > $limit) { + break; + } + } + + $pluck = function ($header, $mail) { + // Not necessary to escape the header here + $pattern = sprintf('/(?<=%s:).*$/m', $header); + if (preg_match($pattern, $mail, $m)) { + return trim(\mb_decode_mimeheader($m[0])); + } + return null; + }; + foreach (array_slice($mails, 0, $limit) as $mail) { + $item = []; + $item['uid'] = $pluck('Message-ID', $mail['content']); + $item['uri'] = $mail['url']; + $item['title'] = $pluck('Subject', $mail['content']); + $item['author'] = preg_replace('/\sat\s/', '@', $pluck('From', $mail['content'])); + $item['timestamp'] = $pluck('Date', $mail['content']); + $item['content'] = nl2br(self::render($mail['content'])); + $this->items[] = $item; + } + } + + /** + * Parse mbox mail. Render some useful html. + * + * Based on https://gist.github.com/jbroadway/2836900 + */ + private static function render($text) + { + $rules = [ + '/[\s\S]*?^Message-ID:[\s\S]*?>\n\n/m' => '', // Metadata + '/-+\s+next part\s+-+[\s\S]+?(?=^$|\Z)/m' => '', // next part + '/(? '$0', // links + '/(\w+) <(.*) at (.*)>/' => '$1', // emails + '/(\*|\*\*|__)(.*?)\1/' => '\2', // bold + // blockquotes + '/(.*)\s+((?:^>.*+\n)+)/m' => function ($regs) { + return sprintf( + '
%s
%s
', + $regs[1], + preg_replace('/^>/m', '', $regs[2]) + ); + }, + ]; + + $text = "\n" . $text . "\n"; + foreach ($rules as $regex => $replacement) { + if (is_callable($replacement)) { + $text = preg_replace_callback($regex, $replacement, $text); + } else { + $text = preg_replace($regex, $replacement, $text); + } + } + return trim($text); + } +}