Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Add adjust_message_entities_to_utf_16 utility #4323

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion telegram/_messageentity.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
# along with this program. If not, see [http://www.gnu.org/licenses/].
"""This module contains an object that represents a Telegram MessageEntity."""

from typing import TYPE_CHECKING, Final, List, Optional
import copy
import itertools
from typing import TYPE_CHECKING, Dict, Final, List, Optional, Sequence

from telegram import constants
from telegram._telegramobject import TelegramObject
Expand Down Expand Up @@ -140,6 +142,81 @@ def de_json(cls, data: Optional[JSONDict], bot: "Bot") -> Optional["MessageEntit

return super().de_json(data=data, bot=bot)

@staticmethod
def adjust_message_entities_to_utf_16(
text: str, entities: Sequence["MessageEntity"]
) -> Sequence["MessageEntity"]:
"""Utility functionality for converting the offset and length of entities from
Unicode (:obj:`str`) to UTF-16 (``utf-16-le`` encoded :obj:`bytes`).

Tip:
Only the offsets and lengths calulated in UTF-16 is acceptable by the Telegram Bot API.
If they are calculated using the Unicode string (:obj:`str` object), errors may occur
when the text contains characters that are not in the Basic Multilingual Plane (BMP).
For more information, see `Unicode <https://en.wikipedia.org/wiki/Unicode>`_ and
`Plane (Unicode) <https://en.wikipedia.org/wiki/Plane_(Unicode)>`_.

.. versionadded:: NEXT.VERSION

Examples:
Below is a snippet of code that demonstrates how to use this function to convert
entities from Unicode to UTF-16 space. The ``unicode_entities`` are calculated in
Unicode and the `utf_16_entities` are calculated in UTF-16.

.. code-block:: python

text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
unicode_entities = [
MessageEntity(offset=2, length=4, type=MessageEntity.BOLD),
MessageEntity(offset=9, length=6, type=MessageEntity.ITALIC),
MessageEntity(offset=28, length=3, type=MessageEntity.UNDERLINE),
]
utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(
text, unicode_entities
)
await bot.send_message(
chat_id=123,
text=text,
entities=utf_16_entities,
)
# utf_16_entities[0]: offset=3, length=4
# utf_16_entities[1]: offset=11, length=6
# utf_16_entities[2]: offset=30, length=6

Args:
text (:obj:`str`): The text that the entities belong to
entities (Sequence[:class:`telegram.MessageEntity`]): Sequence of entities
with offset and length calculated in Unicode

Returns:
Sequence[:class:`telegram.MessageEntity`]: Sequence of entities
with offset and length calculated in UTF-16 encoding
"""
# get sorted positions
positions = sorted(itertools.chain(*((x.offset, x.offset + x.length) for x in entities)))
accumulated_length = 0
# calculate the length of each slice text[:position] in utf-16 accordingly,
# store the position translations
position_translation: Dict[int, int] = {}
for i, position in enumerate(positions):
last_position = positions[i - 1] if i > 0 else 0
text_slice = text[last_position:position]
accumulated_length += len(text_slice.encode("utf-16-le")) // 2
position_translation[position] = accumulated_length
# get the final output entites
out = []
for entity in entities:
translated_positions = position_translation[entity.offset]
translated_length = (
position_translation[entity.offset + entity.length] - translated_positions
)
new_entity = copy.copy(entity)
with new_entity._unfrozen():
new_entity.offset = translated_positions
new_entity.length = translated_length
out.append(new_entity)
return out

ALL_TYPES: Final[List[str]] = list(constants.MessageEntityType)
"""List[:obj:`str`]: A list of all available message entity types."""
BLOCKQUOTE: Final[str] = constants.MessageEntityType.BLOCKQUOTE
Expand Down
22 changes: 22 additions & 0 deletions tests/test_messageentity.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#
# You should have received a copy of the GNU Lesser Public License
# along with this program. If not, see [http://www.gnu.org/licenses/].
import random
from typing import List, Tuple

import pytest

from telegram import MessageEntity, User
Expand Down Expand Up @@ -81,6 +84,25 @@ def test_enum_init(self):
entity = MessageEntity(type="url", offset=0, length=1)
assert entity.type is MessageEntityType.URL

def test_fix_utf16(self):
text = "𠌕 bold 𝄢 italic underlined: 𝛙𝌢𑁍"
inputs_outputs: List[Tuple[Tuple[int, int, str], Tuple[int, int]]] = [
((2, 4, MessageEntity.BOLD), (3, 4)),
((9, 6, MessageEntity.ITALIC), (11, 6)),
((28, 3, MessageEntity.UNDERLINE), (30, 6)),
]
random.shuffle(inputs_outputs)
unicode_entities = [
MessageEntity(offset=_input[0], length=_input[1], type=_input[2])
for _input, _ in inputs_outputs
]
utf_16_entities = MessageEntity.adjust_message_entities_to_utf_16(text, unicode_entities)
for out_entity, input_output in zip(utf_16_entities, inputs_outputs):
_, output = input_output
offset, length = output
assert out_entity.offset == offset
assert out_entity.length == length

def test_equality(self):
a = MessageEntity(MessageEntity.BOLD, 2, 3)
b = MessageEntity(MessageEntity.BOLD, 2, 3)
Expand Down
Loading