Codestin Search App

History

1716 lines (1482 loc) · 68.6 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

# coding=utf-8

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

import bisect

import io

import itertools

import json

import re

import unicodedata

from collections import OrderedDict

from typing import Dict, List, Optional, Tuple, Union

import six

from paddle.utils import try_import

from paddlenlp.utils.log import logger

try:

from functools import lru_cache

except ImportError:

from backports.functools_lru_cache import lru_cache

from ..data.vocab import Vocab

from .tokenizer_utils_base import (

AddedToken,

BatchEncoding,

EncodedInput,

EncodedInputPair,

PaddingStrategy,

PreTokenizedInput,

PreTokenizedInputPair,

PretrainedTokenizerBase,

TensorType,

TextInput,

TextInputPair,

TruncationStrategy,

)

from .utils import InitTrackerMeta, fn_args_to_dict

# Names exported by `from <this module> import *`.
__all__ = [
    "PretrainedTokenizer",
    "BPETokenizer",
    "tokenize_chinese_chars",
    "is_chinese_char",
    "normalize_chars",
    "tokenize_special_chars",
    "convert_to_unicode",
]

def convert_to_unicode(text):
    """
    Returns `text` as a Unicode string, decoding bytes as utf-8.

    Args:
        text (str|bytes): Text to be converted to unicode.

    Returns:
        str: `text` itself when it already is a `str`, otherwise the utf-8
            decoding of the bytes (undecodable bytes are ignored).

    Raises:
        ValueError: If `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    if not isinstance(text, str):
        raise ValueError("Unsupported string type: %s" % (type(text)))
    return text

def whitespace_tokenize(text):
    """
    Trims a piece of text and splits it on runs of whitespace.

    Args:
        text (str): Text to be tokenized.

    Returns:
        list(str): Token list; empty when `text` is empty or only whitespace.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []

def _is_whitespace(char):

"""

Checks whether `chars` is a whitespace character.

"""

# \t, \n, and \r are technically contorl characters but we treat them

# as whitespace since they are generally considered as such.

if char == " " or char == "\t" or char == "\n" or char == "\r":

return True

cat = unicodedata.category(char)

if cat == "Zs":

return True

return False

def _is_control(char):

"""Checks whether `chars` is a control character."""

# These are technically control characters but we count them as whitespace

# characters.

if char == "\t" or char == "\n" or char == "\r":

return False

cat = unicodedata.category(char)

if cat.startswith("C"):

return True

return False

def _is_punctuation(char):

"""Checks whether `chars` is a punctuation character."""

cp = ord(char)

# We treat all non-letter/number ASCII as punctuation.

# Characters such as "^", "$", and "`" are not in the Unicode

# Punctuation class but we treat them as punctuation anyways, for

# consistency.

if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):

return True

cat = unicodedata.category(char)

if cat.startswith("P"):

return True

return False

def _is_end_of_word(text):
    """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
    tail = text[-1]
    # The three predicates are side-effect free and return booleans, so a
    # short-circuiting `or` yields the same answer as combining them with `|`.
    return _is_control(tail) or _is_punctuation(tail) or _is_whitespace(tail)

def _is_start_of_word(text):
    """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
    head = text[0]
    # The three predicates are side-effect free and return booleans, so a
    # short-circuiting `or` yields the same answer as combining them with `|`.
    return _is_control(head) or _is_punctuation(head) or _is_whitespace(head)

def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):

"""

Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.

"""

insertion_idx = bisect.bisect_left(token_list, new_token)

# Checks if new_token is already in the ordered token_list

if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:

# new_token is in token_list, don't add

return

else:

token_list.insert(insertion_idx, new_token)

def is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean
    # characters, despite its name. The modern Korean Hangul alphabet is a
    # different block, as is Japanese Hiragana and Katakana. Those alphabets
    # are used to write space-separated words, so they are not treated
    # specially and are handled like all of the other languages.
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    return any(low <= cp <= high for low, high in cjk_ranges)

def _is_nonnormalized_char(char):

"""Check whther `chars` is a non-normalized character."""

cp = ord(char)

if (

(0xFF00 <= cp <= 0xFFEF)

or (0xFE50 <= cp <= 0xFE6B) # Halfwidth and Fullwidth Forms

or (0x3358 <= cp <= 0x33FF) # Small Form Variants

or (0x249C <= cp <= 0x24E9) # CJK Compatibility

or (0x3200 <= cp <= 0x32FF) # Enclosed Alphanumerics: Ⓛ ⒰

): # Enclosed CJK Letters and Months

return True

return False

def _is_nonnormalized_numeric(char):

"""Check whether `chars` is a non-normalized numeric character."""

cp = ord(char)

if (

(0x2460 <= cp <= 0x249B)

or (0x24EA <= cp <= 0x24FF) #

or (0x2776 <= cp <= 0x2793) #

or (0x2160 <= cp <= 0x217F) # Enclosed Alphanumerics

): # Number Forms

return True

return False

def normalize_chars(text):
    """
    Normalize the text for multilingual and chinese models. Unicode range:
    https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html

    Each character is handled independently:
    - characters flagged by `_is_nonnormalized_char` are replaced by their NFKC form;
    - characters flagged by `_is_nonnormalized_numeric` are replaced by their
      integer value written in decimal digits, with a space on each side;
    - U+F979 is replaced by "凉";
    - anything else is kept unchanged.
    """
    pieces = []
    for ch in text:
        if _is_nonnormalized_char(ch):
            # NFKC may expand one character into several.
            pieces.extend(unicodedata.normalize("NFKC", ch))
        elif _is_nonnormalized_numeric(ch):
            pieces.append(" ")
            pieces.extend(str(int(unicodedata.numeric(ch))))
            pieces.append(" ")
        elif ord(ch) == 0xF979:  # https://www.zhihu.com/question/20697984
            pieces.append("凉")
        else:
            pieces.append(ch)
    return "".join(pieces)

def _is_symbol(char):

"""Check whether CP is the codepoint of a Symbol character."""

cp = ord(char)

if unicodedata.category(char).startswith("S") or (

cp in [0x00AD, 0x00B2, 0x00BA, 0x3007, 0x00B5, 0x00D8, 0x014B, 0x01B1]

return True

return False

def tokenize_special_chars(text):
    """
    Adds whitespace around any special character.

    A character is "special" when its codepoint falls in one of the ranges
    below or when `_is_symbol` accepts it.

    Args:
        text (str): Text to process.

    Returns:
        str: `text` with a space inserted before and after every special character.
    """
    output = []
    for char in text:
        cp = ord(char)
        if (
            (0x3040 <= cp <= 0x30FF)  # Japanese
            or (0x0370 <= cp <= 0x04FF)  # Greek/Coptic & Cyrillic
            or (0x0250 <= cp <= 0x02AF)  # IPA
            or _is_symbol(char)
        ):
            output.append(" ")
            output.append(char)
            output.append(" ")
        else:
            output.append(char)
    return "".join(output)

class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self):
        # Nested dictionaries keyed by single characters. The special key ""
        # inside a node marks that a word ends at that node.
        self.data = {}

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) on word and adds it to the internal `data` trie representation.
        The special key `""` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Prevent empty string
            return
        ref = self.data
        for char in word:
            # Reuse the existing child node, or create it when missing.
            ref = ref.setdefault(char, {})
        ref[""] = 1

    def split(self, text: str) -> List[str]:
        """
        Will look for the words added to the trie within `text`. Output is the original string split along the
        boundaries of the words found.

        This trie will match the longest possible word first !

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        """
        # indexes are counted left of the chars index.
        # "hello", index 0, is left of h, index 1 is between h and e.
        # index 5 is right of the "o".

        # States are going to capture every possible start (indexes as above)
        # as keys, and have as values, a pointer to the position in the trie
        # where we're at. This is a partial match for now.
        # This enables to keep track of multiple matches while we're iterating
        # the string
        # If the trie contains, "blowing", and "lower" and we encounter the
        # string "blower", we need to split into ["b", "lower"].
        # This is where we need to keep track of multiple possible starts.
        states = OrderedDict()

        # This will contain every indices where we need
        # to cut.
        # We force to cut at offset 0 and len(text) (added later)
        offsets = [0]

        # This is used by the lookahead which needs to skip over
        # some text where the full match exceeded the place in the initial
        # for loop
        skip = 0
        # Main loop over the characters of `text`
        for current, current_char in enumerate(text):
            if skip and current < skip:
                # Prevents the lookahead for matching twice
                # like extra_id_100 and id_100
                continue

            # This will track every state
            # that stop matching, we need to stop tracking them.
            # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then
            # fail on "b", we need to remove 0 from the valid states.
            to_remove = set()
            # Whenever we found a match, we need to drop everything
            # this is a greedy algorithm, it will match on the first found token
            reset = False

            # In this case, we already have partial matches (But unfinished)
            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # This is a final match, we need to reset and
                    # store the results in `offsets`.

                    # Lookahead to match longest first
                    # Important in case of extra_id_1 vs extra_id_100
                    # Here we are also actively looking for other earlier partial
                    # matches
                    # "[CLS]", "L", we need to match CLS even if L is special
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match is later, we can stop looking
                            break
                        elif lookstart < start:
                            # This partial match is earlier, the trie pointer
                            # was already updated, so index is + 1
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # Here lookstart == start and
                            #      looktrie_pointer == trie_pointer
                            # It wasn't updated yet so indices are current ones
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                # End of string
                                break
                            next_char = text[lookahead_index]
                        # End lookahead

                    # Storing and resetting
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The current character being looked at has a match within the trie
                    # update the pointer (it will be stored back into states later).
                    trie_pointer = trie_pointer[current_char]

                    # Storing back the new pointer into the states.
                    # Partial matches got longer by one.
                    states[start] = trie_pointer
                else:
                    # The new character has not match in the trie, we need
                    # to stop keeping track of this partial match.
                    # We can't do it directly within the loop because of how
                    # python iteration works
                    to_remove.add(start)

            # Either clearing the full start (we found a real match)
            # Or clearing only the partial matches that didn't work.
            if reset:
                # Keep the same mapping type as the initial `states`.
                states = OrderedDict()
            else:
                for start in to_remove:
                    del states[start]

            # If this character is a starting character within the trie
            # start keeping track of this partial match.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # We have a cut at the end with states.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                # This is a final match, we need to reset and
                # store the results in `offsets`.
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # Longest cut is always the one with lower start so the first
                # item so we need to break.
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        """
        Cuts `text` at every position listed in `offsets`.

        Args:
            text (str): The text to cut.
            offsets (list[int]): Cut positions. `len(text)` is appended to this
                list in place so that the tail of the text is always emitted.

        Returns:
            list[str]: The non-empty pieces of `text`, in order.
        """
        # We have all the offsets now, we just need to do the actual splitting.
        # We need to eventually add the first part of the string and the eventual
        # last part.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway."
                )
                continue
            elif start == end:
                # This might happen if there's a match at index 0
                # we're also preventing zero-width cuts in case of two
                # consecutive matches
                continue
            tokens.append(text[start:end])
            start = end

        return tokens

def tokenize_chinese_chars(text):
    """
    Splits `text` around every CJK character.

    Returns:
        list(str): The pieces of `text` in order, where each CJK character is
            an item of its own and every maximal run of other characters is
            kept together as a single item.
    """
    pieces = []
    pending = []  # characters of the current non-CJK run
    for ch in text:
        if is_chinese_char(ord(ch)):
            if pending:
                pieces.append("".join(pending))
                pending = []
            pieces.append(ch)
        else:
            pending.append(ch)
    if pending:
        pieces.append("".join(pending))
    return pieces

@six.add_metaclass(InitTrackerMeta)

class PretrainedTokenizer(PretrainedTokenizerBase):

"""

Base class for all tokenizers.

Inherits from [`~tokenizer_utils_base.PretrainedTokenizerBase`].

Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading

pretrained tokenizers as well as adding tokens to the vocabulary.

This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the

specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).

- **resource_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each

vocabulary file required by the model, and as associated values, the filename for saving the associated file

(string).

- **pretrained_resource_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the

high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the

low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the

associated pretrained vocabulary file.

- **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`

of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,

or `None` if the model has no maximum input size.

- **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the

`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to

pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer

with the [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`] method.

- **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.

- **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.

Should be `'right'` or `'left'`.

- **truncation_side** (`str`) -- The default value for the side on which the model should have truncation

applied. Should be `'right'` or `'left'`.

Moreover, methods common to tokenizers for tokenization, token/id conversion

and encoding as model inputs are also provided here.

Besides, metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`,

by which subclasses can track arguments for initialization automatically

and expose special tokens initialization used as attributes.

"""

added_tokens_encoder: Dict[str, int] = {}

added_tokens_decoder: Dict[int, str] = {}

unique_no_split_tokens: List[str] = []

tokens_trie = Trie()

_decode_use_source_tokenizer = False

def _pre_init(self, original_init, *args, **kwargs):
    """
    Hook run before `__init__`: it is meant to add special tokens (arguments
    of `__init__` whose name ends with `_token`) as attributes of the
    tokenizer instance.

    The arguments of the pending `__init__` call are collected with
    `fn_args_to_dict`, `self` is dropped from them, and the rest is forwarded
    to the parent initializer. The per-instance added-token state is then
    reset to empty containers.
    """
    call_args = fn_args_to_dict(original_init, self, *args, **kwargs)
    call_args.pop("self", None)
    super(PretrainedTokenizer, self).__init__(**call_args)

    # Fresh per-instance containers, so instances do not share the class-level ones.
    self.added_tokens_encoder: Dict[str, int] = {}
    self.added_tokens_decoder: Dict[int, str] = {}
    self.unique_no_split_tokens: List[str] = []
    self.tokens_trie = Trie()
    self._decode_use_source_tokenizer = False

def _build_special_tokens_map_extended(self, **kwargs):
    """
    Sets the special-token attributes given as keyword arguments.

    Entries whose value is `None`, or whose key is not listed in
    `self.SPECIAL_TOKENS_ATTRIBUTES`, are ignored.

    Raises:
        AssertionError: If `additional_special_tokens` is not a list/tuple of
            `str` or `AddedToken`.
        TypeError: If any other special token is neither `str` nor `AddedToken`.
    """
    for key, value in kwargs.items():
        if value is None or key not in self.SPECIAL_TOKENS_ATTRIBUTES:
            continue

        if key == "additional_special_tokens":
            assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
            every_item_is_token = all(isinstance(t, (str, AddedToken)) for t in value)
            assert every_item_is_token, "One of the tokens is not a string or an AddedToken"
            setattr(self, key, value)
            continue

        if not isinstance(value, (str, AddedToken)):
            raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")
        setattr(self, key, value)

@property
def vocab_size(self) -> int:
    """
    `int`: Size of the base vocabulary (without the added tokens).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    # NOTE(review): presumably concrete tokenizers override this property;
    # `__len__` reads it, so it fails too while this is not overridden.
    raise NotImplementedError

@property
def is_fast(self) -> bool:
    """`bool`: Always `False` for this tokenizer class."""
    return False

def get_added_vocab(self) -> Dict[str, int]:
    """
    Returns the added tokens in the vocabulary as a dictionary of token to index.

    Returns:
        `Dict[str, int]`: The added tokens.
    """
    # The internal dictionary itself is returned (not a copy), so changes made
    # by the caller are visible to the tokenizer.
    return self.added_tokens_encoder

def __len__(self):
    """
    Size of the full vocabulary with the added tokens.

    Computed as `vocab_size` plus the number of entries in `added_tokens_encoder`.
    """
    return self.vocab_size + len(self.added_tokens_encoder)

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
    it with indices starting from length of the current vocabulary.

    Args:
        new_tokens (`List[str]`or `List[AddedToken]`):
            Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
            checking if the tokenizer assign the index of the `unk_token` to them).
        special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the tokens should be added as special tokens.

    Returns:
        `int`: The number of tokens actually added to the vocabulary.

    Examples:

    ```python
    # Let's see how to increase the vocabulary of Bert model and tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
    print("We have added", num_added_toks, "tokens")
    ```"""
    new_tokens = [str(tok) for tok in new_tokens]

    tokens_to_add = []
    for token in new_tokens:
        # Defensive check; after the `str()` conversion above it cannot trigger.
        if not isinstance(token, str):
            raise TypeError(f"Token {token} is not a string but a {type(token)}.")
        # Non-special tokens follow the tokenizer's lower-casing setting.
        if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
            token = token.lower()
        # A token is new when it currently maps to the id of the unknown token
        # and has not been queued already.
        if (
            token != self.unk_token
            and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
            and token not in tokens_to_add
        ):
            tokens_to_add.append(token)
            if self.verbose:
                logger.info(f"Adding {token} to the vocabulary")

    # New ids continue after the current full vocabulary size.
    added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
    added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
    self.added_tokens_encoder.update(added_tok_encoder)
    self.added_tokens_decoder.update(added_tok_decoder)

    # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
    if special_tokens:
        if len(new_tokens) == 1:
            _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
        else:
            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
    else:
        # Or on the newly added tokens
        if len(tokens_to_add) == 1:
            _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
        else:
            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
    # Rebuild the trie used by `tokenize` so it knows the updated no-split list.
    self._create_trie(self.unique_no_split_tokens)

    return len(tokens_to_add)

def _create_trie(self, unique_no_split_tokens):
    """
    Builds a new `Trie` from `unique_no_split_tokens` and stores it in `self.tokens_trie`.

    When the tokenizer has a truthy `do_lower_case` attribute, tokens that are
    not in `self.all_special_tokens` are added in lower case; every other
    token is added unchanged.
    """
    fresh_trie = Trie()
    for tok in unique_no_split_tokens:
        lower_it = hasattr(self, "do_lower_case") and self.do_lower_case and tok not in self.all_special_tokens
        fresh_trie.add(tok.lower() if lower_it else tok)
    self.tokens_trie = fresh_trie

def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
    """
    Hook for transformations to apply before tokenization.

    Implementations are expected to pop the keyword arguments they use and
    hand back the remaining ones, since `kwargs` is tested at the end of the
    encoding process to be sure all the arguments have been used. This base
    version changes nothing.

    Args:
        text (`str`):
            The text to prepare.
        is_split_into_words (`bool`, *optional*, defaults to `False`):
            Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
            tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
            which it will tokenize. This is useful for NER or token classification.
        kwargs:
            Keyword arguments to use for the tokenization.

    Returns:
        `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
    """
    return text, kwargs

def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    """
    Converts a string in a sequence of tokens, using the tokenizer.

    Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
    (BPE/SentencePieces/WordPieces). Takes care of added tokens.

    Args:
        text (`str`):
            The sequence to be encoded.
        **kwargs (additional keyword arguments):
            Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

    Returns:
        `List[str]`: The list of tokens.
    """
    # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
    all_special_tokens_extended = {
        str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
    }

    text, kwargs = self.prepare_for_tokenization(text, **kwargs)

    # TODO: should this be in the base class?
    if hasattr(self, "do_lower_case") and self.do_lower_case:
        # convert non-special tokens to lowercase
        escaped_special_toks = [
            re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
        ]
        if escaped_special_toks:
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
        else:
            # Without any protected token the first group would be the empty
            # pattern "()", which matches everywhere and leaves the second
            # group unset (None). Lower-case character by character instead,
            # which is what the "(.+?)" branch does.
            text = "".join(char.lower() for char in text)

    no_split_token = set(self.unique_no_split_tokens)
    tokens = self.tokens_trie.split(text)
    # ["This is something", "<special_token_1>", "  else"]
    for i, token in enumerate(tokens):
        if token in no_split_token:
            tok_extended = all_special_tokens_extended.get(token, None)
            left = tokens[i - 1] if i > 0 else None
            right = tokens[i + 1] if i < len(tokens) - 1 else None
            if isinstance(tok_extended, AddedToken):
                if tok_extended.rstrip and right:
                    # A bit counter-intuitive but we strip the left of the string
                    # since tok_extended.rstrip means the special token is eating all white spaces on its right
                    tokens[i + 1] = right.lstrip()
                # Strip white spaces on the left
                if tok_extended.lstrip and left:
                    tokens[i - 1] = left.rstrip()  # Opposite here
            else:
                # We strip left and right by default
                if right:
                    tokens[i + 1] = right.lstrip()
                if left:
                    tokens[i - 1] = left.rstrip()
    # ["This is something", "<special_token_1>", "else"]
    tokenized_text = []
    for token in tokens:
        # Need to skip eventual empty (fully stripped) tokens
        if not token:
            continue
        if token in no_split_token:
            tokenized_text.append(token)
        else:
            tokenized_text.extend(self._tokenize(token))
    # ["This", " is", " something", "<special_token_1>", "else"]
    return tokenized_text

def _tokenize(self, text, **kwargs):
    """
    Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
    vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

    Do NOT take care of added tokens.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    # NOTE(review): `tokenize` calls this for every piece that is not a
    # no-split token, so concrete tokenizers presumably override it.
    raise NotImplementedError

def convert_tokens_to_ids(self, tokens):
    """
    Converts a token, or a sequence of tokens, to id(s), taking the added tokens into account.

    Args:
        tokens (str|Iterable[str]|None): A single token or an iterable of tokens.

    Returns:
        The id of the token when `tokens` is a `str`, a list with one id per
        token when it is an iterable, or `None` when `tokens` is `None`.
    """
    if tokens is None:
        return None

    to_id = self._convert_token_to_id_with_added_voc
    if isinstance(tokens, str):
        return to_id(tokens)
    return [to_id(tok) for tok in tokens]

def _convert_token_to_id_with_added_voc(self, token):

if token is None:

return None

if token in self.added_tokens_encoder:

return self.added_tokens_encoder[token]

return self._convert_token_to_id(token)

def _convert_token_to_id(self, token):
    """Converts `token` to its id by delegating to `self.vocab.to_indices`."""
    # NOTE(review): `self.vocab` is not assigned in the visible part of this
    # class — presumably set by concrete tokenizers; confirm.
    return self.vocab.to_indices(token)

def convert_tokens_to_string(self, tokens):
    """
    Converts a sequence of tokens (list of string) to a single string by
    using ``' '.join(tokens)`` .

    Args:
        tokens (list[str]): A sequence of tokens.

    Returns:
        str: Converted string, with the tokens separated by single spaces.
    """
    return " ".join(tokens)

def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
    """
    Converts an id, or a sequence of ids, to token(s), taking the added tokens into account.

    Args:
        ids (int|Iterable[int]): A single id or an iterable of ids. Items of
            an iterable are converted with `int()` before the lookup.
        skip_special_tokens (bool, optional): Whether to drop the ids found in
            `self.all_special_ids`. Only applied when `ids` is an iterable.
            Defaults to `False`.

    Returns:
        str|list[str]: The token for a single `int` id, otherwise the list of
        tokens in the same order as `ids`.
    """
    if isinstance(ids, int):
        if ids in self.added_tokens_decoder:
            return self.added_tokens_decoder[ids]
        return self._convert_id_to_token(ids)

    # Look the special ids up once, instead of once per id, and use a set so
    # that each membership test is constant time.
    special_ids = set(self.all_special_ids) if skip_special_tokens else ()
    tokens = []
    for index in ids:
        index = int(index)
        if index in special_ids:
            continue
        if index in self.added_tokens_decoder:
            tokens.append(self.added_tokens_decoder[index])
        else:
            tokens.append(self._convert_id_to_token(index))
    return tokens

def _convert_id_to_token(self, index):
    """Converts the id `index` to its token by delegating to `self.vocab.to_tokens`."""
    # NOTE(review): `self.vocab` is not assigned in the visible part of this
    # class — presumably set by concrete tokenizers; confirm.
    return self.vocab.to_tokens(index)

@staticmethod
def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs):
    """
    Builds a `Vocab` from a vocabulary file through `Vocab.from_dict`,
    keeping every token of the file. The file holds one token per line, and
    the (zero-based) line number is the index of the token on that line.

    Args:
        filepath (str): path of file to construct vocabulary.
        unk_token (str): special token for unknown token. If no need, it also
            could be `None`. Defaults to `None`.
        pad_token (str): special token for padding token. If no need, it also
            could be `None`. Defaults to `None`.
        bos_token (str): special token for bos token. If no need, it also
            could be `None`. Defaults to `None`.
        eos_token (str): special token for eos token. If no need, it also
            could be `None`. Defaults to `None`.
        **kwargs (dict): keyword arguments for `Vocab.from_dict`.

    Returns:
        Vocab: An instance of `Vocab`.
    """
    with io.open(filepath, "r", encoding="utf-8") as vocab_file:
        # Only the trailing newline is removed; when a token appears on
        # several lines, the last line number wins.
        token_to_idx = {line.rstrip("\n"): int(index) for index, line in enumerate(vocab_file)}
    return Vocab.from_dict(
        token_to_idx, unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs
    )

@staticmethod
def save_vocabulary(filepath, vocab):
    """
    Write every token to a vocabulary file, one token per line, so that the
    line number of a token equals its index.

    Args:
        filepath (str): Path of the file to write (UTF-8, overwritten).
        vocab (Vocab|dict): The `Vocab` or token-to-index `dict` to save.
    """
    # A `Vocab` supplies its tokens through `idx_to_token`; a plain mapping
    # is ordered by the index stored for each token.
    ordered_tokens = (
        vocab.idx_to_token if isinstance(vocab, Vocab) else sorted(vocab.keys(), key=vocab.__getitem__)
    )

    with io.open(filepath, "w", encoding="utf-8") as out_file:
        out_file.writelines(token + "\n" for token in ordered_tokens)

def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    """
    Build the special-tokens mask for one sequence or a pair of sequences.

    When the ids carry no special tokens yet, the mask is all zeros with one
    entry per id of both sequences. When they are already formatted, the
    work is delegated to the parent class implementation.

    Args:
        token_ids_0 (List[int]): List of ids of the first sequence.
        token_ids_1 (List[int], optional): List of ids of the second
            sequence. Defaults to `None`.
        already_has_special_tokens (bool, optional): Whether or not the
            token list is already formatted with special tokens for the
            model. Defaults to `False`.

    Returns:
        results (List[int]): The list of integers in the range [0, 1]:
        1 for a special token, 0 for a sequence token.

    Raises:
        ValueError: If `already_has_special_tokens` is true and
            `token_ids_1` is also given.
    """
    if not already_has_special_tokens:
        # A missing or empty second sequence contributes nothing.
        pair_length = len(token_ids_1) if token_ids_1 else 0
        return [0] * (pair_length + len(token_ids_0))

    if token_ids_1 is not None:
        raise ValueError(
            "You should not supply a second sequence if the provided sequence of "
            "ids is already formatted with special tokens for the model."
        )

    return super().get_special_tokens_mask(
        token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
    )

def num_special_tokens_to_add(self, pair=False):
    """
    Returns the number of added tokens when encoding a sequence with special tokens.

    The count is obtained by running ``self.build_inputs_with_special_tokens``
    on empty sequences and measuring the length of the result.

    Args:
        pair (bool, optional):
            Whether the number of added tokens should be computed in the case of a sequence pair or a single
            sequence. Defaults to `False`.

    Returns:
        int: Number of special tokens added to sequences.
    """
    # With empty inputs, everything in the built sequence is a special token.
    return len(self.build_inputs_with_special_tokens([], [] if pair else None))

def _encode_plus(

self,

text: Union[TextInput, PreTokenizedInput, EncodedInput],

text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,

add_special_tokens: bool = True,

padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,

truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,

max_length: Optional[int] = None,

stride: int = 0,

is_split_into_words: bool = False,

pad_to_multiple_of: Optional[int] = None,

return_tensors: Optional[Union[str, TensorType]] = None,

return_position_ids: Optional[bool] = None,

return_token_type_ids: Optional[bool] = None,

return_attention_mask: Optional[bool] = None,

return_overflowing_tokens: bool = False,

return_special_tokens_mask: bool = False,

return_offsets_mapping: bool = False,

return_length: bool = False,

verbose: bool = True,

**kwargs

) -> BatchEncoding:

def get_input_ids(text):

if isinstance(text, str):

tokens = self.tokenize(text, **kwargs)

return self.convert_tokens_to_ids(tokens)

elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):

if is_split_into_words:

tokens = list(

itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))

)

return self.convert_tokens_to_ids(tokens)

else:

return self.convert_tokens_to_ids(text)

elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):

return text

else:

if is_split_into_words:

raise ValueError(

f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."

)

else:

raise ValueError(

f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."

)

first_ids = get_input_ids(text)

second_ids = get_input_ids(text_pair) if text_pair is not None else None

if return_offsets_mapping:

kwargs["text"] = text

kwargs["text_pair"] = text_pair

return self.prepare_for_model(

first_ids,

pair_ids=second_ids,

add_special_tokens=add_special_tokens,

padding=padding_strategy.value,

truncation=truncation_strategy.value,

max_length=max_length,

stride=stride,

pad_to_multiple_of=pad_to_multiple_of,

return_tensors=return_tensors,

prepend_batch_axis=True,

return_position_ids=return_position_ids,

return_attention_mask=return_attention_mask,

return_token_type_ids=return_token_type_ids,

return_overflowing_tokens=return_overflowing_tokens,

return_special_tokens_mask=return_special_tokens_mask,

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

tokenizer_utils.py

Latest commit

History

tokenizer_utils.py

File metadata and controls