1
0
mirror of https://github.com/danog/MadelineProto.git synced 2024-11-30 04:08:59 +01:00

Switch to custom MarkdownV2 parser

This commit is contained in:
Daniil Gentili 2023-07-15 16:02:32 +02:00
parent f4688e4954
commit e81f4be805
Signed by: danog
GPG Key ID: 8C1BE3B34B230CA7
7 changed files with 343 additions and 64 deletions

View File

@ -298,11 +298,6 @@ trait ResponseHandler
case 500:
case -500:
case -503:
if ($response['error_message'] === 'MSG_WAIT_FAILED') {
$this->call_queue[$request->getQueueId()] = [];
$this->methodRecall(message_id: $request->getMsgId(), postpone: true);
return null;
}
if ((($response['error_code'] === -503 || $response['error_message'] === '-503') && !\in_array($request->getConstructor(), ['messages.getBotCallbackAnswer', 'messages.getInlineBotResults'], true))
|| (\in_array($response['error_message'], ['MSGID_DECREASE_RETRY', 'HISTORY_GET_FAILED', 'RPC_CONNECT_FAILED', 'RPC_CALL_FAIL', 'RPC_MCGET_FAIL', 'PERSISTENT_TIMESTAMP_OUTDATED', 'RPC_MCGET_FAIL', 'no workers running', 'No workers running'], true))) {
EventLoop::delay(1.0, fn () => $this->methodRecall(message_id: $request->getMsgId()));

View File

@ -22,6 +22,7 @@ namespace danog\MadelineProto;
use danog\MadelineProto\TL\Conversion\DOMEntities;
use danog\MadelineProto\TL\Conversion\Extension;
use danog\MadelineProto\TL\Conversion\MarkdownEntities;
use Parsedown;
use Webmozart\Assert\Assert;
@ -107,11 +108,11 @@ abstract class StrTools extends Extension
*
* @see https://docs.madelineproto.xyz/API_docs/methods/messages.sendMessage.html#usage-of-parse_mode
*
* @return \danog\MadelineProto\TL\Conversion\DOMEntities Object containing message and entities
* @return \danog\MadelineProto\TL\Conversion\MarkdownEntities Object containing message and entities
*/
public static function markdownToMessageEntities(string $markdown): \danog\MadelineProto\TL\Conversion\DOMEntities
public static function markdownToMessageEntities(string $markdown): \danog\MadelineProto\TL\Conversion\MarkdownEntities
{
return new DOMEntities(Parsedown::instance()->line($markdown));
return new MarkdownEntities($markdown);
}
/**
* Convert a message and a set of entities to HTML.

View File

@ -415,7 +415,7 @@ trait BotAPI
* @param array $arguments Arguments
* @internal
*/
public function parseMode(array $arguments): array
public static function parseMode(array $arguments): array
{
if (($arguments['message'] ?? '') === '' || !isset($arguments['parse_mode'])) {
return $arguments;
@ -430,10 +430,11 @@ trait BotAPI
$arguments['parse_mode'] = \str_replace('textParseMode', '', $arguments['parse_mode']['_']);
}
if (\stripos($arguments['parse_mode'], 'markdown') !== false) {
$arguments['message'] = Parsedown::instance()->line($arguments['message']);
$arguments['parse_mode'] = 'HTML';
}
if (\stripos($arguments['parse_mode'], 'html') !== false) {
$entities = new MarkdownEntities($arguments['message']);
$arguments['message'] = $entities->message;
$arguments['entities'] = \array_merge($arguments['entities'] ?? [], $entities->entities);
unset($arguments['parse_mode']);
} elseif (\stripos($arguments['parse_mode'], 'html') !== false) {
$entities = new DOMEntities($arguments['message']);
$arguments['message'] = $entities->message;
$arguments['entities'] = \array_merge($arguments['entities'] ?? [], $entities->entities);
@ -470,7 +471,7 @@ trait BotAPI
if (\trim($cur) !== '') {
$multiple_args[] = [
...$multiple_args_base,
'message' => $cur
'message' => $cur,
];
}
$cur = $vv;
@ -481,7 +482,7 @@ trait BotAPI
if (\trim($cur) !== '') {
$multiple_args[] = [
...$multiple_args_base,
'message' => $cur
'message' => $cur,
];
}
@ -500,35 +501,17 @@ trait BotAPI
$newentity['length'] = $entity['length'] - (StrTools::mbStrlen($multiple_args[$i]['message']) - $entity['offset']);
$entity['length'] = StrTools::mbStrlen($multiple_args[$i]['message']) - $entity['offset'];
$offset += $entity['length'];
//StrTools::mbStrlen($multiple_args[$i]['message']);
$newentity['offset'] = $offset;
$prev_length = StrTools::mbStrlen($multiple_args[$i]['message']);
$multiple_args[$i]['message'] = \rtrim($multiple_args[$i]['message']);
$diff = $prev_length - StrTools::mbStrlen($multiple_args[$i]['message']);
if ($diff) {
$entity['length'] -= $diff;
foreach ($args['entities'] as $key => &$eentity) {
if ($key > $k) {
$eentity['offset'] -= $diff;
}
}
}
$orig = $multiple_args[$i]['message'];
$trimmed = rtrim($orig);
$diff = StrTools::mbStrlen($orig) - StrTools::mbStrlen($trimmed);
$entity['length'] -= $diff;
$multiple_args[$i]['message'] = $trimmed;
$multiple_args[$i]['entities'][] = $entity;
$i++;
$entity = $newentity;
continue;
}
$prev_length = StrTools::mbStrlen($multiple_args[$i]['message']);
$multiple_args[$i]['message'] = \rtrim($multiple_args[$i]['message']);
$diff = $prev_length - StrTools::mbStrlen($multiple_args[$i]['message']);
if ($diff) {
$entity['length'] -= $diff;
foreach ($args['entities'] as $key => &$eentity) {
if ($key > $k) {
$eentity['offset'] -= $diff;
}
}
}
$multiple_args[$i]['entities'][] = $entity;
break;
} while (true);

View File

@ -15,7 +15,7 @@ use Throwable;
/**
* Class that converts HTML to a message + set of entities.
*/
final class DOMEntities
final class DOMEntities extends Entities
{
/** Converted entities */
public readonly array $entities;
@ -52,6 +52,11 @@ final class DOMEntities
$message .= "\n";
return 1;
}
$length = 0;
if ($node->nodeName === 'li') {
$message .= "- ";
$length += 2;
}
/** @var DOMElement $node */
$entity = match ($node->nodeName) {
's', 'strike', 'del' =>['_' => 'messageEntityStrike'],
@ -64,10 +69,9 @@ final class DOMEntities
'pre' => ['_' => 'messageEntityPre', 'language' => $node->getAttribute('language') ?? ''],
'tg-emoji' => ['_' => 'messageEntityCustomEmoji', 'document_id' => (int) $node->getAttribute('emoji-id')],
'emoji' => ['_' => 'messageEntityCustomEmoji', 'document_id' => (int) $node->getAttribute('id')],
'a' => self::handleA($node),
'a' => self::handleLink($node->getAttribute('href')),
default => null,
};
$length = 0;
foreach ($node->childNodes as $sub) {
$length += self::parseNode($sub, $offset+$length, $message, $entities);
}
@ -91,16 +95,4 @@ final class DOMEntities
}
return $length;
}
private static function handleA(DOMElement $node): array
{
$href = $node->getAttribute('href');
if (\preg_match('|^mention:(.+)|', $href, $matches) || \preg_match('|^tg://user\\?id=(.+)|', $href, $matches)) {
return ['_' => 'inputMessageEntityMentionName', 'user_id' => $matches[1]];
}
if (\preg_match('|^emoji:(\d+)$|', $href, $matches) || \preg_match('|^tg://emoji\\?id=(.+)|', $href, $matches)) {
return ['_' => 'messageEntityCustomEmoji', 'document_id' => (int) $matches[1]];
}
return ['_' => 'messageEntityTextUrl', 'url' => $href];
}
}

View File

@ -0,0 +1,32 @@
<?php
declare(strict_types=1);
namespace danog\MadelineProto\TL\Conversion;
use danog\MadelineProto\Exception;
use danog\MadelineProto\StrTools;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMText;
use Throwable;
/**
 * Common base for the classes that convert HTML or markdown into a message + set of entities.
 *
 * @internal
 */
abstract class Entities
{
    /**
     * Turn the target of a link into the entity constructor that represents it.
     *
     * - `mention:<id>` and `tg://user?id=<id>` produce a mention-by-name entity,
     * - `emoji:<digits>` and `tg://emoji?id=<id>` produce a custom emoji entity,
     * - every other value is kept verbatim as the URL of a text link.
     *
     * The returned array carries no offset/length: the caller is expected to add them.
     *
     * @param string $href Link target
     *
     * @return array Partial entity constructor
     */
    protected static function handleLink(string $href): array
    {
        $found = [];

        $isMention = \preg_match('|^mention:(.+)|', $href, $found)
            || \preg_match('|^tg://user\\?id=(.+)|', $href, $found);
        if ($isMention) {
            // The captured identifier is passed through as-is (still a string).
            return ['_' => 'inputMessageEntityMentionName', 'user_id' => $found[1]];
        }

        $isEmoji = \preg_match('|^emoji:(\d+)$|', $href, $found)
            || \preg_match('|^tg://emoji\\?id=(.+)|', $href, $found);
        if ($isEmoji) {
            return ['_' => 'messageEntityCustomEmoji', 'document_id' => (int) $found[1]];
        }

        return ['_' => 'messageEntityTextUrl', 'url' => $href];
    }
}

View File

@ -0,0 +1,191 @@
<?php
declare(strict_types=1);
namespace danog\MadelineProto\TL\Conversion;
use AssertionError;
use danog\MadelineProto\Exception;
use danog\MadelineProto\StrTools;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMText;
use Throwable;
/**
 * Class that converts Markdown to a message + set of entities.
 *
 * The input is scanned once, left to right. Plain text is copied to the output
 * message, while the markers below open/close an entity:
 *
 * - `*bold*`, `_italic_`, `__underline__`, `~strike~`, `||spoiler||`, `` `code` ``
 * - `[text](url)` for links, mentions and custom emojis (see Entities::handleLink)
 * - a triple-backtick fence, optionally followed by a language name, for preformatted blocks
 *
 * A backslash makes the following character literal.
 * Trailing whitespace is never included in the length of an entity.
 */
final class MarkdownEntities extends Entities
{
    /** Converted entities */
    public readonly array $entities;
    /** Converted message */
    public readonly string $message;
    /**
     * @param string $markdown Markdown to parse
     *
     * @throws Exception If the markdown is malformed (unclosed marker, `]` of a link not followed by `(`, ...)
     */
    public function __construct(string $markdown)
    {
        $markdown = \str_replace("\r\n", "\n", $markdown);
        try {
            $markdownLen = \strlen($markdown);
            // Output text, and its length as measured by StrTools::mbStrlen
            // (presumably UTF-16 code units, the unit of entity offsets - confirm against StrTools).
            $message = '';
            $messageLen = 0;
            $entities = [];
            // Byte offset of the cursor inside $markdown
            $offset = 0;
            // Currently open markers, as [marker, offset of the entity in the output message]
            $stack = [];
            while ($offset < $markdownLen) {
                // Copy everything up to the next character that may have a special meaning
                $len = \strcspn($markdown, '*_~`[]|\\', $offset);
                $piece = \substr($markdown, $offset, $len);
                $offset += $len;
                if ($offset === $markdownLen) {
                    $message .= $piece;
                    break;
                }

                $char = $markdown[$offset++];
                $next = $markdown[$offset] ?? '';
                if ($char === '\\') {
                    if ($next === '') {
                        // Backslash at the very end of the input: nothing to escape, it is dropped
                        $message .= $piece;
                        break;
                    }
                    // Take the whole escaped character, not just its first byte: the lead byte
                    // plus any UTF-8 continuation bytes (10xxxxxx) that follow it.
                    $escapedBytes = 1;
                    while ($escapedBytes < 4
                        && (\ord($markdown[$offset + $escapedBytes] ?? "\0") & 0xC0) === 0x80
                    ) {
                        $escapedBytes++;
                    }
                    $escaped = \substr($markdown, $offset, $escapedBytes);
                    $message .= $piece.$escaped;
                    $messageLen += StrTools::mbStrlen($piece) + StrTools::mbStrlen($escaped);
                    $offset += $escapedBytes;
                    continue;
                }

                if ($char === '_' && $next === '_') {
                    $offset++;
                    $char = '__';
                } elseif ($char === '|') {
                    if ($next === '|') {
                        $offset++;
                        $char = '||';
                    } else {
                        // A lone pipe is plain text
                        $message .= $piece.$char;
                        $messageLen += StrTools::mbStrlen($piece)+1;
                        continue;
                    }
                } elseif ($char === '[') {
                    // A [ always opens a link: it is stored under the marker that will close it.
                    // It must never be matched against an already open link.
                    $message .= $piece;
                    $messageLen += StrTools::mbStrlen($piece);
                    $stack []= ['](', $messageLen];
                    continue;
                } elseif ($char === ']') {
                    if (!$stack || \end($stack)[0] !== '](') {
                        // No link is being closed here: plain text
                        $message .= $piece.$char;
                        $messageLen += StrTools::mbStrlen($piece)+1;
                        continue;
                    }
                    if ($next !== '(') {
                        throw new AssertionError("( expected @ pos $offset!");
                    }
                    $offset++;
                    $char = "](";
                } elseif ($char === '`' && $next === '`' && ($markdown[$offset+1] ?? '') === '`') {
                    $message .= $piece;
                    $messageLen += StrTools::mbStrlen($piece);
                    $offset += 2;

                    // Everything up to the first space or newline is the language name
                    $langLen = \strcspn($markdown, "\n ", $offset);
                    $language = \substr($markdown, $offset, $langLen);
                    $offset += $langLen;
                    // The input may end right after the language name, hence the guarded read
                    if (($markdown[$offset] ?? '') === "\n") {
                        $offset++;
                    }

                    // Look for the closing fence, skipping fences preceded by a backslash
                    $posClose = $offset;
                    while (($posClose = \strpos($markdown, '```', $posClose)) !== false) {
                        if ($markdown[$posClose-1] === '\\') {
                            $posClose++;
                            continue;
                        }
                        break;
                    }
                    if ($posClose === false) {
                        throw new AssertionError("Unclosed ``` opened @ pos $offset!");
                    }

                    $start = $messageLen;
                    $message .= $piece = \substr($markdown, $offset, $posClose-$offset);
                    $pieceLen = StrTools::mbStrlen($piece);
                    $messageLen += $pieceLen;
                    // Trailing whitespace stays in the message but not in the entity
                    $pieceLen -= \strlen($piece) - \strlen(\rtrim($piece, " \r\n"));
                    if ($pieceLen > 0) {
                        $entities []= [
                            '_' => 'messageEntityPre',
                            'language' => $language,
                            'offset' => $start,
                            'length' => $pieceLen,
                        ];
                    }
                    $offset = $posClose+3;
                    continue;
                }

                if ($stack && \end($stack)[0] === $char) {
                    // The marker matches the innermost open one: close it
                    [, $start] = \array_pop($stack);
                    if ($char === '](') {
                        // Look for the end of the URL, skipping parentheses preceded by a backslash
                        $posClose = $offset;
                        while (($posClose = \strpos($markdown, ')', $posClose)) !== false) {
                            if ($markdown[$posClose-1] === '\\') {
                                $posClose++;
                                continue;
                            }
                            break;
                        }
                        if ($posClose === false) {
                            throw new AssertionError("Unclosed ) opened @ pos $offset!");
                        }
                        $entity = self::handleLink(\substr($markdown, $offset, $posClose-$offset));
                        $offset = $posClose+1;
                    } else {
                        $entity = match ($char) {
                            '*' => ['_' => 'messageEntityBold'],
                            '_' => ['_' => 'messageEntityItalic'],
                            '__' => ['_' => 'messageEntityUnderline'],
                            '`' => ['_' => 'messageEntityCode'],
                            '~' => ['_' => 'messageEntityStrike'],
                            '||' => ['_' => 'messageEntitySpoiler'],
                            default => throw new AssertionError("Unknown char $char @ pos $offset!")
                        };
                    }
                    $message .= $piece;
                    $messageLen += StrTools::mbStrlen($piece);

                    // Trailing whitespace stays in the message but not in the entity
                    $lengthReal = $messageLen-$start;
                    $lengthReal -= \strlen($message) - \strlen(\rtrim($message, " \r\n"));
                    if ($lengthReal > 0) {
                        $entities []= $entity + ['offset' => $start, 'length' => $lengthReal];
                    }
                } else {
                    // Otherwise the marker opens a new entity
                    $message .= $piece;
                    $messageLen += StrTools::mbStrlen($piece);
                    $stack []= [$char, $messageLen];
                }
            }
            if ($stack) {
                // Markers left open would silently vanish from the message: reject the input
                // instead, like unclosed ``` and ) are rejected above.
                $unclosed = \array_map(
                    static fn (array $open): string => $open[0] === '](' ? '[' : $open[0],
                    $stack,
                );
                throw new AssertionError('Found unclosed markdown elements '.\implode(', ', $unclosed).'!');
            }

            $this->message = $message;
            $this->entities = $entities;
        } catch (Throwable $e) {
            throw new Exception("An error occurred while parsing $markdown: {$e->getMessage()}", $e->getCode());
        }
    }
}

View File

@ -39,7 +39,7 @@ class EntitiesTest extends MadelineTestCase
if (\strtolower($mode) === 'html') {
$this->assertEquals(
\str_replace(['<br/>', ' </b>', 'mention:'], ['<br>', '</b> ', 'tg://user?id='], $htmlReverse ?? $html),
StrTools::messageEntitiesToHtml(
StrTools::entitiesToHtml(
$resultMTProto['message'],
$resultMTProto['entities'],
true
@ -138,7 +138,7 @@ class EntitiesTest extends MadelineTestCase
],
[
'markdown',
'test** test**',
'test* test*',
'test test',
[
[
@ -220,7 +220,7 @@ class EntitiesTest extends MadelineTestCase
],
[
'markdown',
'test **bold *bold and italic* bold**',
'test *bold _bold and italic_ bold*',
'test bold bold and italic bold',
[
[
@ -235,6 +235,37 @@ class EntitiesTest extends MadelineTestCase
],
],
],
[
'markdown',
"a\nb\nc",
"a\nb\nc",
[],
],
[
'markdown',
"a\n\nb\n\nc",
"a\n\nb\n\nc",
[],
],
[
'markdown',
"a\n\n\nb\n\n\nc",
"a\n\n\nb\n\n\nc",
[],
],
[
'markdown',
"a\n```php\n<?php\necho 'yay';\n```",
"a\n<?php\necho 'yay';\n",
[
[
'offset' => 2,
'length' => 17,
'type' => 'pre',
'language' => 'php'
]
],
],
[
'html',
'<b>\'"</b>',
@ -269,20 +300,74 @@ class EntitiesTest extends MadelineTestCase
],
[
'markdown',
'_a b c &lt;b&gt; &amp; &quot; &#039;_',
'a b c <b> & " \'',
'_a b c <b> & " \' \_ \* \~ \\__',
'a b c <b> & " \' _ * ~ _',
[
[
'offset' => 0,
'length' => 15,
'length' => 23,
'type' => 'italic',
],
],
],
[
'markdown',
'test *italic* **bold** <u>underlined</u> ~~strikethrough~~ <pre language="test">pre</pre> <code>code</code> <spoiler>spoiler</spoiler>',
'test italic bold underlined strikethrough pre code spoiler',
'[link ](https://google.com/)test',
'link test',
[
[
'offset' => 0,
'length' => 4,
'type' => 'text_url',
'url' => 'https://google.com/'
],
],
],
[
'markdown',
'[link ](https://google.com/)',
'link ',
[
[
'offset' => 0,
'length' => 4,
'type' => 'text_url',
'url' => 'https://google.com/'
],
],
],
[
'html',
'<a href="https://google.com/">link </a>test',
'link test',
[
[
'offset' => 0,
'length' => 4,
'type' => 'text_url',
'url' => 'https://google.com/'
],
],
'<a href="https://google.com/">link</a> test',
],
[
'html',
'<a href="https://google.com/">link </a>',
'link ',
[
[
'offset' => 0,
'length' => 4,
'type' => 'text_url',
'url' => 'https://google.com/'
],
],
'<a href="https://google.com/">link</a> ',
],
[
'markdown',
'test _italic_ *bold* __underlined__ ~strikethrough~ ```test pre``` `code` ||spoiler||',
'test italic bold underlined strikethrough pre code spoiler',
[
[
'offset' => 5,
@ -306,17 +391,17 @@ class EntitiesTest extends MadelineTestCase
],
[
'offset' => 42,
'length' => 3,
'length' => 4,
'type' => 'pre',
'language' => 'test',
],
[
'offset' => 46,
'offset' => 47,
'length' => 4,
'type' => 'code',
],
[
'offset' => 51,
'offset' => 52,
'length' => 7,
'type' => 'spoiler',
],