-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Expand file tree
/
Copy pathnaturallanguageml.py
More file actions
138 lines (121 loc) · 4.93 KB
/
naturallanguageml.py
File metadata and controls
138 lines (121 loc) · 4.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
import apache_beam as beam
from apache_beam.metrics import Metrics
try:
from google.cloud import language
from google.cloud import language_v1
except ImportError:
raise ImportError(
'Google Cloud Natural Language API not supported for this execution '
'environment (could not import Natural Language API client).')
__all__ = ['Document', 'AnnotateText']
class Document(object):
"""Represents the input to :class:`AnnotateText` transform.
Args:
content (str): The content of the input or the Google Cloud Storage URI
where the file is stored.
type (`Union[str, google.cloud.language_v1.Document.Type]`): Text type.
Possible values are `HTML`, `PLAIN_TEXT`. The default value is
`PLAIN_TEXT`.
language_hint (`Optional[str]`): The language of the text. If not specified,
language will be automatically detected. Values should conform to
ISO-639-1 standard.
encoding (`Optional[str]`): Text encoding. Possible values are: `NONE`,
`UTF8`, `UTF16`, `UTF32`. The default value is `UTF8`.
from_gcs (bool): Whether the content should be interpret as a Google Cloud
Storage URI. The default value is :data:`False`.
"""
def __init__(
self,
content: str,
type: Union[str, language_v1.Document.Type] = 'PLAIN_TEXT',
language_hint: Optional[str] = None,
encoding: Optional[str] = 'UTF8',
from_gcs: bool = False):
self.content = content
self.type = type
self.encoding = encoding
self.language_hint = language_hint
self.from_gcs = from_gcs
@staticmethod
def to_dict(document: 'Document') -> Mapping[str, Optional[str]]:
if document.from_gcs:
dict_repr = {'gcs_content_uri': document.content}
else:
dict_repr = {'content': document.content}
dict_repr.update({
'type': document.type, 'language': document.language_hint
})
return dict_repr
@beam.ptransform_fn
def AnnotateText(
pcoll: beam.pvalue.PCollection,
features: Union[Mapping[str, bool],
language_v1.AnnotateTextRequest.Features],
timeout: Optional[float] = None,
metadata: Optional[Sequence[Tuple[str, str]]] = None):
"""A :class:`~apache_beam.transforms.ptransform.PTransform`
for annotating text using the Google Cloud Natural Language API:
https://cloud.google.com/natural-language/docs.
Args:
pcoll (:class:`~apache_beam.pvalue.PCollection`): An input PCollection of
:class:`Document` objects.
features (`Union[Mapping[str, bool], types.AnnotateTextRequest.Features]`):
A dictionary of natural language operations to be performed on given
text in the following format::
{'extact_syntax'=True, 'extract_entities'=True}
timeout (`Optional[float]`): The amount of time, in seconds, to wait
for the request to complete. The timeout applies to each individual
retry attempt.
metadata (`Optional[Sequence[Tuple[str, str]]]`): Additional metadata
that is provided to the method.
"""
return pcoll | beam.ParDo(_AnnotateTextFn(features, timeout, metadata))
@beam.typehints.with_input_types(Document)
@beam.typehints.with_output_types(language_v1.AnnotateTextResponse)
class _AnnotateTextFn(beam.DoFn):
def __init__(
self,
features: Union[Mapping[str, bool],
language_v1.AnnotateTextRequest.Features],
timeout: Optional[float],
metadata: Optional[Sequence[Tuple[str, str]]] = None):
self.features = features
self.timeout = timeout
self.metadata = metadata
self.api_calls = Metrics.counter(self.__class__.__name__, 'api_calls')
self.client = None
def setup(self):
self.client = self._get_api_client()
@staticmethod
def _get_api_client() -> language.LanguageServiceClient:
return language.LanguageServiceClient()
def process(self, element):
response = self.client.annotate_text(
document=Document.to_dict(element),
features=self.features,
encoding_type=element.encoding,
timeout=self.timeout,
metadata=self.metadata)
self.api_calls.inc()
yield response