添加代码

2019-01-29 18:31:51 +08:00 · 2019-01-29 18:31:51 +08:00 · 7ab44a43a6
commit 7ab44a43a6
13 changed files with 105616 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,16 @@
+# bert_feature
+
+how to use Bert generate the sentence vector
+
+1、download the model 
+
+model path: https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
+
+2、Move the model in the same directory
+
+3、init BertVector object and invokes the encode method, the param must be list
+```
+from bert.extrac_feature import BertVector
+bv = BertVector()
+bv.encode(['你好'])
+```
--- a/init.py
+++ b/init.py
@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/args.py
+++ b/args.py
@ -0,0 +1,31 @@
+import os
+from enum import Enum
+
+file_path = os.path.dirname(__file__)
+
+model_dir = os.path.join(file_path, 'chinese_L-12_H-768_A-12/')
+config_name = os.path.join(model_dir, 'bert_config.json')
+ckpt_name = os.path.join(model_dir, 'bert_model.ckpt')
+
+output_dir = os.path.join(model_dir, '../tmp/result/')
+
+vocab_file = os.path.join(model_dir, 'vocab.txt')
+data_dir = os.path.join(model_dir, '../data/')
+
+max_seq_len = 32
+
+layer_indexes = [-2, -3, -4]
+
+batch_size = 128
+
+gpu_memory_fraction = 0.8
+
+learning_rate = 0.00005
+
+num_train_epochs = 10
+
+use_gpu = False
+if use_gpu:
+    device_id = '0'
+else:
+    device_id = '-1'
--- a/data/test.csv
+++ b/data/test.csv
--- a/data/train.csv
+++ b/data/train.csv
--- a/extract_feature.py
+++ b/extract_feature.py
@ -0,0 +1,339 @@
+from graph import import_tf
+import modeling
+import tokenization
+from graph import optimize_graph
+import args
+from queue import Queue
+from threading import Thread
+
+tf = import_tf(0, True)
+
+
+class InputExample(object):
+
+    def __init__(self, unique_id, text_a, text_b):
+        self.unique_id = unique_id
+        self.text_a = text_a
+        self.text_b = text_b
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
+        self.unique_id = unique_id
+        self.tokens = tokens
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.input_type_ids = input_type_ids
+
+
+class BertVector:
+
+    def __init__(self, batch_size=32):
+        """
+        init BertVector
+        :param batch_size:     Depending on your memory default is 32
+        """
+        self.max_seq_length = args.max_seq_len
+        self.layer_indexes = args.layer_indexes
+        self.gpu_memory_fraction = 1
+        self.graph_path = optimize_graph()
+        self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
+        self.batch_size = batch_size
+        self.estimator = self.get_estimator()
+        self.input_queue = Queue(maxsize=1)
+        self.output_queue = Queue(maxsize=1)
+        self.predict_thread = Thread(target=self.predict_from_queue, daemon=True)
+        self.predict_thread.start()
+
+    def get_estimator(self):
+        from tensorflow.python.estimator.estimator import Estimator
+        from tensorflow.python.estimator.run_config import RunConfig
+        from tensorflow.python.estimator.model_fn import EstimatorSpec
+
+        def model_fn(features, labels, mode, params):
+            with tf.gfile.GFile(self.graph_path, 'rb') as f:
+                graph_def = tf.GraphDef()
+                graph_def.ParseFromString(f.read())
+
+            input_names = ['input_ids', 'input_mask', 'input_type_ids']
+
+            output = tf.import_graph_def(graph_def,
+                                         input_map={k + ':0': features[k] for k in input_names},
+                                         return_elements=['final_encodes:0'])
+
+            return EstimatorSpec(mode=mode, predictions={
+                'encodes': output[0]
+            })
+
+        config = tf.ConfigProto()
+        config.gpu_options.allow_growth = True
+        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
+        config.log_device_placement = False
+        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+
+        return Estimator(model_fn=model_fn, config=RunConfig(session_config=config),
+                         params={'batch_size': self.batch_size})
+
+    def predict_from_queue(self):
+        prediction = self.estimator.predict(input_fn=self.queue_predict_input_fn, yield_single_examples=False)
+        for i in prediction:
+            self.output_queue.put(i)
+
+    def encode(self, sentence):
+        self.input_queue.put(sentence)
+        prediction = self.output_queue.get()
+        return prediction
+
+    def queue_predict_input_fn(self):
+
+        return (tf.data.Dataset.from_generator(
+            self.generate_from_queue,
+            output_types={'unique_ids': tf.int32,
+                          'input_ids': tf.int32,
+                          'input_mask': tf.int32,
+                          'input_type_ids': tf.int32},
+            output_shapes={
+                'unique_ids': (1,),
+                'input_ids': (None, self.max_seq_length),
+                'input_mask': (None, self.max_seq_length),
+                'input_type_ids': (None, self.max_seq_length)}))
+
+    def generate_from_queue(self):
+        while True:
+            features = list(self.convert_examples_to_features(seq_length=self.max_seq_length, tokenizer=self.tokenizer))
+            yield {
+                'unique_ids': [f.unique_id for f in features],
+                'input_ids': [f.input_ids for f in features],
+                'input_mask': [f.input_mask for f in features],
+                'input_type_ids': [f.input_type_ids for f in features]
+            }
+
+    def input_fn_builder(self, features, seq_length):
+        """Creates an `input_fn` closure to be passed to Estimator."""
+
+        all_unique_ids = []
+        all_input_ids = []
+        all_input_mask = []
+        all_input_type_ids = []
+
+        for feature in features:
+            all_unique_ids.append(feature.unique_id)
+            all_input_ids.append(feature.input_ids)
+            all_input_mask.append(feature.input_mask)
+            all_input_type_ids.append(feature.input_type_ids)
+
+        def input_fn(params):
+            """The actual input function."""
+            batch_size = params["batch_size"]
+
+            num_examples = len(features)
+
+            # This is for demo purposes and does NOT scale to large data sets. We do
+            # not use Dataset.from_generator() because that uses tf.py_func which is
+            # not TPU compatible. The right way to load data is with TFRecordReader.
+            d = tf.data.Dataset.from_tensor_slices({
+                "unique_ids":
+                    tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
+                "input_ids":
+                    tf.constant(
+                        all_input_ids, shape=[num_examples, seq_length],
+                        dtype=tf.int32),
+                "input_mask":
+                    tf.constant(
+                        all_input_mask,
+                        shape=[num_examples, seq_length],
+                        dtype=tf.int32),
+                "input_type_ids":
+                    tf.constant(
+                        all_input_type_ids,
+                        shape=[num_examples, seq_length],
+                        dtype=tf.int32),
+            })
+
+            d = d.batch(batch_size=batch_size, drop_remainder=False)
+            return d
+
+        return input_fn
+
+    def model_fn_builder(self, bert_config, init_checkpoint, layer_indexes):
+        """Returns `model_fn` closure for TPUEstimator."""
+
+        def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+            """The `model_fn` for TPUEstimator."""
+
+            unique_ids = features["unique_ids"]
+            input_ids = features["input_ids"]
+            input_mask = features["input_mask"]
+            input_type_ids = features["input_type_ids"]
+
+            jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
+
+            with jit_scope():
+                model = modeling.BertModel(
+                    config=bert_config,
+                    is_training=False,
+                    input_ids=input_ids,
+                    input_mask=input_mask,
+                    token_type_ids=input_type_ids)
+
+                if mode != tf.estimator.ModeKeys.PREDICT:
+                    raise ValueError("Only PREDICT modes are supported: %s" % (mode))
+
+                tvars = tf.trainable_variables()
+
+                (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
+                                                                                                           init_checkpoint)
+
+                tf.logging.info("**** Trainable Variables ****")
+                for var in tvars:
+                    init_string = ""
+                    if var.name in initialized_variable_names:
+                        init_string = ", *INIT_FROM_CKPT*"
+                    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                                    init_string)
+
+                all_layers = model.get_all_encoder_layers()
+
+                predictions = {
+                    "unique_id": unique_ids,
+                }
+
+                for (i, layer_index) in enumerate(layer_indexes):
+                    predictions["layer_output_%d" % i] = all_layers[layer_index]
+
+                from tensorflow.python.estimator.model_fn import EstimatorSpec
+
+                output_spec = EstimatorSpec(mode=mode, predictions=predictions)
+                return output_spec
+
+        return model_fn
+
+    def convert_examples_to_features(self, seq_length, tokenizer):
+        """Loads a data file into a list of `InputBatch`s."""
+
+        features = []
+        input_masks = []
+        examples = self._to_example(self.input_queue.get())
+        for (ex_index, example) in enumerate(examples):
+            tokens_a = tokenizer.tokenize(example.text_a)
+
+            # if the sentences's length is more than seq_length, only use sentence's left part
+            if len(tokens_a) > seq_length - 2:
+                tokens_a = tokens_a[0:(seq_length - 2)]
+
+            # The convention in BERT is:
+            # (a) For sequence pairs:
+            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+            # (b) For single sequences:
+            #  tokens:   [CLS] the dog is hairy . [SEP]
+            #  type_ids: 0     0   0   0  0     0 0
+            #
+            # Where "type_ids" are used to indicate whether this is the first
+            # sequence or the second sequence. The embedding vectors for `type=0` and
+            # `type=1` were learned during pre-training and are added to the wordpiece
+            # embedding vector (and position vector). This is not *strictly* necessary
+            # since the [SEP] token unambiguously separates the sequences, but it makes
+            # it easier for the model to learn the concept of sequences.
+            #
+            # For classification tasks, the first vector (corresponding to [CLS]) is
+            # used as as the "sentence vector". Note that this only makes sense because
+            # the entire model is fine-tuned.
+            tokens = []
+            input_type_ids = []
+            tokens.append("[CLS]")
+            input_type_ids.append(0)
+            for token in tokens_a:
+                tokens.append(token)
+                input_type_ids.append(0)
+            tokens.append("[SEP]")
+            input_type_ids.append(0)
+
+            # Where "input_ids" are tokens's index in vocabulary
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            input_mask = [1] * len(input_ids)
+            input_masks.append(input_mask)
+            # Zero-pad up to the sequence length.
+            while len(input_ids) < seq_length:
+                input_ids.append(0)
+                input_mask.append(0)
+                input_type_ids.append(0)
+
+            assert len(input_ids) == seq_length
+            assert len(input_mask) == seq_length
+            assert len(input_type_ids) == seq_length
+
+            if ex_index < 5:
+                tf.logging.info("*** Example ***")
+                tf.logging.info("unique_id: %s" % (example.unique_id))
+                tf.logging.info("tokens: %s" % " ".join(
+                    [tokenization.printable_text(x) for x in tokens]))
+                tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                tf.logging.info(
+                    "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+            yield InputFeatures(
+                unique_id=example.unique_id,
+                tokens=tokens,
+                input_ids=input_ids,
+                input_mask=input_mask,
+                input_type_ids=input_type_ids)
+
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+
+        # This is a simple heuristic which will always truncate the longer sequence
+        # one token at a time. This makes more sense than truncating an equal percent
+        # of tokens from each, since if one sequence is very short then each token
+        # that's truncated likely contains more information than a longer sequence.
+        while True:
+            total_length = len(tokens_a) + len(tokens_b)
+            if total_length <= max_length:
+                break
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+
+    @staticmethod
+    def _to_example(sentences):
+        import re
+        """
+        sentences to InputExample
+        :param sentences: list of strings
+        :return: list of InputExample
+        """
+        unique_id = 0
+        for ss in sentences:
+            line = tokenization.convert_to_unicode(ss)
+            if not line:
+                continue
+            line = line.strip()
+            text_a = None
+            text_b = None
+            m = re.match(r"^(.*) \|\|\| (.*)$", line)
+            if m is None:
+                text_a = line
+            else:
+                text_a = m.group(1)
+                text_b = m.group(2)
+            yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
+            unique_id += 1
+
+
+if __name__ == "__main__":
+    import time
+
+    bert = BertVector()
+    while True:
+        question = input('question: ')
+        start = time.time()
+        vectors = bert.encode([question])
+        print(str(vectors))
+        print(f'predict time:----------{time.time() - start}')
--- a/extract_features_or.py
+++ b/extract_features_or.py
@ -0,0 +1,427 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract pre-computed feature vectors from BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import os
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
+import re
+
+import modeling
+import tokenization
+import tensorflow as tf
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_file", None, "")
+
+flags.DEFINE_string("output_file", None, "")
+
+flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
+
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+flags.DEFINE_string("master", None,
+                    "If using a TPU, the address of the master.")
+
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
+
+flags.DEFINE_bool(
+    "use_one_hot_embeddings", False,
+    "If True, tf.one_hot will be used for embedding lookups, otherwise "
+    "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
+    "since it is much faster.")
+
+
+class InputExample(object):
+
+    def __init__(self, unique_id, text_a, text_b):
+        self.unique_id = unique_id
+        self.text_a = text_a
+        self.text_b = text_b
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
+        self.unique_id = unique_id
+        self.tokens = tokens
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.input_type_ids = input_type_ids
+
+
+def input_fn_builder(features, seq_length):
+    """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+    all_unique_ids = []
+    all_input_ids = []
+    all_input_mask = []
+    all_input_type_ids = []
+
+    for feature in features:
+        all_unique_ids.append(feature.unique_id)
+        all_input_ids.append(feature.input_ids)
+        all_input_mask.append(feature.input_mask)
+        all_input_type_ids.append(feature.input_type_ids)
+
+    def input_fn(params):
+        """The actual input function."""
+        batch_size = params["batch_size"]
+
+        num_examples = len(features)
+
+        # This is for demo purposes and does NOT scale to large data sets. We do
+        # not use Dataset.from_generator() because that uses tf.py_func which is
+        # not TPU compatible. The right way to load data is with TFRecordReader.
+        d = tf.data.Dataset.from_tensor_slices({
+            "unique_ids":
+                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
+            "input_ids":
+                tf.constant(
+                    all_input_ids, shape=[num_examples, seq_length],
+                    dtype=tf.int32),
+            "input_mask":
+                tf.constant(
+                    all_input_mask,
+                    shape=[num_examples, seq_length],
+                    dtype=tf.int32),
+            "input_type_ids":
+                tf.constant(
+                    all_input_type_ids,
+                    shape=[num_examples, seq_length],
+                    dtype=tf.int32),
+        })
+
+        d = d.batch(batch_size=batch_size, drop_remainder=False)
+        return d
+
+    return input_fn
+
+
+def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
+                     use_one_hot_embeddings):
+    """Returns `model_fn` closure for TPUEstimator."""
+
+    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+        """The `model_fn` for TPUEstimator."""
+
+        unique_ids = features["unique_ids"]
+        input_ids = features["input_ids"]
+        input_mask = features["input_mask"]
+        input_type_ids = features["input_type_ids"]
+
+        model = modeling.BertModel(
+            config=bert_config,
+            is_training=False,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            token_type_ids=input_type_ids,
+            use_one_hot_embeddings=use_one_hot_embeddings)
+
+        if mode != tf.estimator.ModeKeys.PREDICT:
+            raise ValueError("Only PREDICT modes are supported: %s" % (mode))
+
+        tvars = tf.trainable_variables()
+        scaffold_fn = None
+        (assignment_map,
+         initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
+            tvars, init_checkpoint)
+        if use_tpu:
+
+            def tpu_scaffold():
+                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+                return tf.train.Scaffold()
+
+            scaffold_fn = tpu_scaffold
+        else:
+            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+            init_string = ""
+            if var.name in initialized_variable_names:
+                init_string = ", *INIT_FROM_CKPT*"
+            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                            init_string)
+
+        all_layers = model.get_all_encoder_layers()
+
+        predictions = {
+            "unique_id": unique_ids,
+        }
+
+        for (i, layer_index) in enumerate(layer_indexes):
+            predictions["layer_output_%d" % i] = all_layers[layer_index]
+
+        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
+        return output_spec
+
+    return model_fn
+
+
+def convert_examples_to_features(examples, seq_length, tokenizer):
+    """Loads a data file into a list of `InputBatch`s."""
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        tokens_a = tokenizer.tokenize(example.text_a)
+
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.tokenize(example.text_b)
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > seq_length - 2:
+                tokens_a = tokens_a[0:(seq_length - 2)]
+
+        # The convention in BERT is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        input_type_ids = []
+        tokens.append("[CLS]")
+        input_type_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            input_type_ids.append(0)
+        tokens.append("[SEP]")
+        input_type_ids.append(0)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                input_type_ids.append(1)
+            tokens.append("[SEP]")
+            input_type_ids.append(1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        while len(input_ids) < seq_length:
+            input_ids.append(0)
+            input_mask.append(0)
+            input_type_ids.append(0)
+
+        assert len(input_ids) == seq_length
+        assert len(input_mask) == seq_length
+        assert len(input_type_ids) == seq_length
+
+        if ex_index < 5:
+            tf.logging.info("*** Example ***")
+            tf.logging.info("unique_id: %s" % (example.unique_id))
+            tf.logging.info("tokens: %s" % " ".join(
+                [tokenization.printable_text(x) for x in tokens]))
+            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            tf.logging.info(
+                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+        features.append(
+            InputFeatures(
+                unique_id=example.unique_id,
+                tokens=tokens,
+                input_ids=input_ids,
+                input_mask=input_mask,
+                input_type_ids=input_type_ids))
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def read_examples(input_file):
+    """Read a list of `InputExample`s from an input file."""
+    examples = []
+    unique_id = 0
+    with tf.gfile.GFile(input_file, "r") as reader:
+        while True:
+            line = tokenization.convert_to_unicode(reader.readline())
+            if not line:
+                break
+            line = line.strip()
+            text_a = None
+            text_b = None
+            m = re.match(r"^(.*) \|\|\| (.*)$", line)
+            if m is None:
+                text_a = line
+            else:
+                text_a = m.group(1)
+                text_b = m.group(2)
+            examples.append(
+                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
+            unique_id += 1
+    return examples
+
+
+def main(_):
+    tf.logging.set_verbosity(tf.logging.INFO)
+
+    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
+
+    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+    run_config = tf.contrib.tpu.RunConfig(
+        master=FLAGS.master,
+        tpu_config=tf.contrib.tpu.TPUConfig(
+            num_shards=FLAGS.num_tpu_cores,
+            per_host_input_for_training=is_per_host))
+
+    examples = read_examples(FLAGS.input_file)
+
+    features = convert_examples_to_features(
+        examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
+
+    unique_id_to_feature = {}
+    for feature in features:
+        unique_id_to_feature[feature.unique_id] = feature
+
+    model_fn = model_fn_builder(
+        bert_config=bert_config,
+        init_checkpoint=FLAGS.init_checkpoint,
+        layer_indexes=layer_indexes,
+        use_tpu=FLAGS.use_tpu,
+        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
+
+    # If TPU is not available, this will fall back to normal Estimator on CPU
+    # or GPU.
+    estimator = tf.contrib.tpu.TPUEstimator(
+        use_tpu=FLAGS.use_tpu,
+        model_fn=model_fn,
+        config=run_config,
+        predict_batch_size=FLAGS.batch_size)
+
+    input_fn = input_fn_builder(
+        features=features, seq_length=FLAGS.max_seq_length)
+
+    # return estimator.predict(input_fn, yield_single_examples=True)
+    for result in estimator.predict(input_fn, yield_single_examples=True):
+        tf.logging.info(str(result))
+    # with open('bert_feature.txt', 'r')as f:
+    #     f.write(str(result))
+    #     f.write('\n')
+
+    # with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
+    #                                              "w")) as writer:
+    #     for result in estimator.predict(input_fn, yield_single_examples=True):
+    #         unique_id = int(result["unique_id"])
+    #         feature = unique_id_to_feature[unique_id]
+    #         output_json = collections.OrderedDict()
+    #         output_json["linex_index"] = unique_id
+    #         all_features = []
+    #         for (i, token) in enumerate(feature.tokens):
+    #             all_layers = []
+    #             for (j, layer_index) in enumerate(layer_indexes):
+    #                 layer_output = result["layer_output_%d" % j]
+    #                 layers = collections.OrderedDict()
+    #                 layers["index"] = layer_index
+    #                 layers["values"] = [
+    #                     round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+    #                 ]
+    #                 all_layers.append(layers)
+    #             features = collections.OrderedDict()
+    #             features["token"] = token
+    #             features["layers"] = all_layers
+    #             all_features.append(features)
+    #         output_json["features"] = all_features
+    #         writer.write(json.dumps(output_json) + "\n")
+
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("input_file")
+    flags.mark_flag_as_required("vocab_file")
+    flags.mark_flag_as_required("bert_config_file")
+    flags.mark_flag_as_required("init_checkpoint")
+    flags.mark_flag_as_required("output_file")
+    tf.app.run()
--- a/graph.py
+++ b/graph.py
@ -0,0 +1,124 @@
+import os
+import tempfile
+import json
+import logging
+from termcolor import colored
+import modeling
+import args
+import contextlib
+
+
+def import_tf(device_id=-1, verbose=False):
+    os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if device_id < 0 else str(device_id)
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' if verbose else '3'
+    import tensorflow as tf
+    tf.logging.set_verbosity(tf.logging.DEBUG if verbose else tf.logging.ERROR)
+    return tf
+
+
+def set_logger(context, verbose=False):
+    logger = logging.getLogger(context)
+    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    formatter = logging.Formatter(
+        '%(levelname)-.1s:' + context + ':[%(filename).5s:%(funcName).3s:%(lineno)3d]:%(message)s', datefmt=
+        '%m-%d %H:%M:%S')
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.DEBUG if verbose else logging.INFO)
+    console_handler.setFormatter(formatter)
+    logger.handlers = []
+    logger.addHandler(console_handler)
+    return logger
+
+
+def optimize_graph(logger=None, verbose=False):
+    if not logger:
+        logger = set_logger(colored('BERT_VEC', 'yellow'), verbose)
+    try:
+        # we don't need GPU for optimizing the graph
+        tf = import_tf(device_id=0, verbose=verbose)
+        from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
+
+        # allow_soft_placement:自动选择运行设备
+        config = tf.ConfigProto(allow_soft_placement=True)
+        config_fp = args.config_name
+        init_checkpoint = args.ckpt_name
+        logger.info('model config: %s' % config_fp)
+
+        # 加载bert配置文件
+        with tf.gfile.GFile(config_fp, 'r') as f:
+            bert_config = modeling.BertConfig.from_dict(json.load(f))
+
+        logger.info('build graph...')
+        # input placeholders, not sure if they are friendly to XLA
+        input_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_ids')
+        input_mask = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_mask')
+        input_type_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_type_ids')
+
+        # xla加速
+        jit_scope = tf.contrib.compiler.jit.experimental_jit_scope if args.xla else contextlib.suppress
+
+        with jit_scope():
+            input_tensors = [input_ids, input_mask, input_type_ids]
+
+            model = modeling.BertModel(
+                config=bert_config,
+                is_training=False,
+                input_ids=input_ids,
+                input_mask=input_mask,
+                token_type_ids=input_type_ids,
+                use_one_hot_embeddings=False)
+
+            # 获取所有要训练的变量
+            tvars = tf.trainable_variables()
+
+            (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
+                                                                                                       init_checkpoint)
+
+            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+            minus_mask = lambda x, m: x - tf.expand_dims(1.0 - m, axis=-1) * 1e30
+            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
+            masked_reduce_max = lambda x, m: tf.reduce_max(minus_mask(x, m), axis=1)
+            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
+                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
+
+            # 共享卷积核
+            with tf.variable_scope("pooling"):
+                # 如果只有一层，就只取对应那一层的weight
+                if len(args.layer_indexes) == 1:
+                    encoder_layer = model.all_encoder_layers[args.layer_indexes[0]]
+                else:
+                    # 否则遍历需要取的层，把所有层的weight取出来并拼接起来shape:768*层数
+                    all_layers = [model.all_encoder_layers[l] for l in args.layer_indexes]
+                    encoder_layer = tf.concat(all_layers, -1)
+
+                input_mask = tf.cast(input_mask, tf.float32)
+
+            # 以下代码是句向量的生成方法，可以理解为做了一个卷积的操作，但是没有把结果相加, 卷积核是input_mask
+            pooled = masked_reduce_mean(encoder_layer, input_mask)
+            pooled = tf.identity(pooled, 'final_encodes')
+
+            output_tensors = [pooled]
+            tmp_g = tf.get_default_graph().as_graph_def()
+
+        with tf.Session(config=config) as sess:
+            logger.info('load parameters from checkpoint...')
+            sess.run(tf.global_variables_initializer())
+            logger.info('freeze...')
+            tmp_g = tf.graph_util.convert_variables_to_constants(sess, tmp_g, [n.name[:-2] for n in output_tensors])
+            dtypes = [n.dtype for n in input_tensors]
+            logger.info('optimize...')
+            tmp_g = optimize_for_inference(
+                tmp_g,
+                [n.name[:-2] for n in input_tensors],
+                [n.name[:-2] for n in output_tensors],
+                [dtype.as_datatype_enum for dtype in dtypes],
+                False)
+        tmp_file = tempfile.NamedTemporaryFile('w', delete=False).name
+        logger.info('write graph to a tmp file: %s' % tmp_file)
+        with tf.gfile.GFile(tmp_file, 'wb') as f:
+            f.write(tmp_g.SerializeToString())
+        return tmp_file
+    except Exception as e:
+        logger.error('fail to optimize the graph!')
+        logger.error(e)
--- a/modeling.py
+++ b/modeling.py
@ -0,0 +1,988 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The main BERT model and related functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import json
+import math
+import re
+import six
+import tensorflow as tf
+
+
+class BertConfig(object):
+  """Configuration for `BertModel`."""
+
+  def __init__(self,
+               vocab_size,
+               hidden_size=768,
+               num_hidden_layers=12,
+               num_attention_heads=12,
+               intermediate_size=3072,
+               hidden_act="gelu",
+               hidden_dropout_prob=0.1,
+               attention_probs_dropout_prob=0.1,
+               max_position_embeddings=512,
+               type_vocab_size=16,
+               initializer_range=0.02):
+    """Constructs BertConfig.
+
+    Args:
+      vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
+      hidden_size: Size of the encoder layers and the pooler layer.
+      num_hidden_layers: Number of hidden layers in the Transformer encoder.
+      num_attention_heads: Number of attention heads for each attention layer in
+        the Transformer encoder.
+      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+        layer in the Transformer encoder.
+      hidden_act: The non-linear activation function (function or string) in the
+        encoder and pooler.
+      hidden_dropout_prob: The dropout probability for all fully connected
+        layers in the embeddings, encoder, and pooler.
+      attention_probs_dropout_prob: The dropout ratio for the attention
+        probabilities.
+      max_position_embeddings: The maximum sequence length that this model might
+        ever be used with. Typically set this to something large just in case
+        (e.g., 512 or 1024 or 2048).
+      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+        `BertModel`.
+      initializer_range: The stdev of the truncated_normal_initializer for
+        initializing all weight matrices.
+    """
+    self.vocab_size = vocab_size
+    self.hidden_size = hidden_size
+    self.num_hidden_layers = num_hidden_layers
+    self.num_attention_heads = num_attention_heads
+    self.hidden_act = hidden_act
+    self.intermediate_size = intermediate_size
+    self.hidden_dropout_prob = hidden_dropout_prob
+    self.attention_probs_dropout_prob = attention_probs_dropout_prob
+    self.max_position_embeddings = max_position_embeddings
+    self.type_vocab_size = type_vocab_size
+    self.initializer_range = initializer_range
+
+  @classmethod
+  def from_dict(cls, json_object):
+    """Constructs a `BertConfig` from a Python dictionary of parameters."""
+    config = BertConfig(vocab_size=None)
+    for (key, value) in six.iteritems(json_object):
+      config.__dict__[key] = value
+    return config
+
+  @classmethod
+  def from_json_file(cls, json_file):
+    """Constructs a `BertConfig` from a json file of parameters."""
+    with tf.gfile.GFile(json_file, "r") as reader:
+      text = reader.read()
+    return cls.from_dict(json.loads(text))
+
+  def to_dict(self):
+    """Serializes this instance to a Python dictionary."""
+    output = copy.deepcopy(self.__dict__)
+    return output
+
+  def to_json_string(self):
+    """Serializes this instance to a JSON string."""
+    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+class BertModel(object):
+  """BERT model ("Bidirectional Embedding Representations from a Transformer").
+
+  Example usage:
+
+  ```python
+  # Already been converted into WordPiece token ids
+  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
+  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
+  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
+
+  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
+    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
+
+  model = modeling.BertModel(config=config, is_training=True,
+    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
+
+  label_embeddings = tf.get_variable(...)
+  pooled_output = model.get_pooled_output()
+  logits = tf.matmul(pooled_output, label_embeddings)
+  ...
+  ```
+  """
+
+  def __init__(self,
+               config,
+               is_training,
+               input_ids,
+               input_mask=None,
+               token_type_ids=None,
+               use_one_hot_embeddings=True,
+               scope=None):
+    """Constructor for BertModel.
+
+    Args:
+      config: `BertConfig` instance.
+      is_training: bool. rue for training model, false for eval model. Controls
+        whether dropout will be applied.
+      input_ids: int32 Tensor of shape [batch_size, seq_length].
+      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
+      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
+      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
+        embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
+        it is must faster if this is True, on the CPU or GPU, it is faster if
+        this is False.
+      scope: (optional) variable scope. Defaults to "bert".
+
+    Raises:
+      ValueError: The config is invalid or one of the input tensor shapes
+        is invalid.
+    """
+    config = copy.deepcopy(config)
+    if not is_training:
+      config.hidden_dropout_prob = 0.0
+      config.attention_probs_dropout_prob = 0.0
+
+    input_shape = get_shape_list(input_ids, expected_rank=2)
+    batch_size = input_shape[0]
+    seq_length = input_shape[1]
+
+    if input_mask is None:
+      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
+
+    if token_type_ids is None:
+      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
+
+    with tf.variable_scope(scope, default_name="bert"):
+      with tf.variable_scope("embeddings"):
+        # Perform embedding lookup on the word ids.
+        (self.embedding_output, self.embedding_table) = embedding_lookup(
+            input_ids=input_ids,
+            vocab_size=config.vocab_size,
+            embedding_size=config.hidden_size,
+            initializer_range=config.initializer_range,
+            word_embedding_name="word_embeddings",
+            use_one_hot_embeddings=use_one_hot_embeddings)
+
+        # Add positional embeddings and token type embeddings, then layer
+        # normalize and perform dropout.
+        self.embedding_output = embedding_postprocessor(
+            input_tensor=self.embedding_output,
+            use_token_type=True,
+            token_type_ids=token_type_ids,
+            token_type_vocab_size=config.type_vocab_size,
+            token_type_embedding_name="token_type_embeddings",
+            use_position_embeddings=True,
+            position_embedding_name="position_embeddings",
+            initializer_range=config.initializer_range,
+            max_position_embeddings=config.max_position_embeddings,
+            dropout_prob=config.hidden_dropout_prob)
+
+      with tf.variable_scope("encoder"):
+        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
+        # mask of shape [batch_size, seq_length, seq_length] which is used
+        # for the attention scores.
+        attention_mask = create_attention_mask_from_input_mask(
+            input_ids, input_mask)
+
+        # Run the stacked transformer.
+        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
+        self.all_encoder_layers = transformer_model(
+            input_tensor=self.embedding_output,
+            attention_mask=attention_mask,
+            hidden_size=config.hidden_size,
+            num_hidden_layers=config.num_hidden_layers,
+            num_attention_heads=config.num_attention_heads,
+            intermediate_size=config.intermediate_size,
+            intermediate_act_fn=get_activation(config.hidden_act),
+            hidden_dropout_prob=config.hidden_dropout_prob,
+            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
+            initializer_range=config.initializer_range,
+            do_return_all_layers=True)
+
+      self.sequence_output = self.all_encoder_layers[-1]
+      # The "pooler" converts the encoded sequence tensor of shape
+      # [batch_size, seq_length, hidden_size] to a tensor of shape
+      # [batch_size, hidden_size]. This is necessary for segment-level
+      # (or segment-pair-level) classification tasks where we need a fixed
+      # dimensional representation of the segment.
+      with tf.variable_scope("pooler"):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token. We assume that this has been pre-trained
+        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
+        self.pooled_output = tf.layers.dense(
+            first_token_tensor,
+            config.hidden_size,
+            activation=tf.tanh,
+            kernel_initializer=create_initializer(config.initializer_range))
+
+  def get_pooled_output(self):
+    return self.pooled_output
+
+  def get_sequence_output(self):
+    """Gets final hidden layer of encoder.
+
+    Returns:
+      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
+      to the final hidden of the transformer encoder.
+    """
+    return self.sequence_output
+
+  def get_all_encoder_layers(self):
+    return self.all_encoder_layers
+
+  def get_embedding_output(self):
+    """Gets output of the embedding lookup (i.e., input to the transformer).
+
+    Returns:
+      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
+      to the output of the embedding layer, after summing the word
+      embeddings with the positional embeddings and the token type embeddings,
+      then performing layer normalization. This is the input to the transformer.
+    """
+    return self.embedding_output
+
+  def get_embedding_table(self):
+    return self.embedding_table
+
+
+def gelu(input_tensor):
+  """Gaussian Error Linear Unit.
+
+  This is a smoother version of the RELU.
+  Original paper: https://arxiv.org/abs/1606.08415
+
+  Args:
+    input_tensor: float Tensor to perform activation.
+
+  Returns:
+    `input_tensor` with the GELU activation applied.
+  """
+  cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
+  return input_tensor * cdf
+
+
+def get_activation(activation_string):
+  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
+
+  Args:
+    activation_string: String name of the activation function.
+
+  Returns:
+    A Python function corresponding to the activation function. If
+    `activation_string` is None, empty, or "linear", this will return None.
+    If `activation_string` is not a string, it will return `activation_string`.
+
+  Raises:
+    ValueError: The `activation_string` does not correspond to a known
+      activation.
+  """
+
+  # We assume that anything that"s not a string is already an activation
+  # function, so we just return it.
+  if not isinstance(activation_string, six.string_types):
+    return activation_string
+
+  if not activation_string:
+    return None
+
+  act = activation_string.lower()
+  if act == "linear":
+    return None
+  elif act == "relu":
+    return tf.nn.relu
+  elif act == "gelu":
+    return gelu
+  elif act == "tanh":
+    return tf.tanh
+  else:
+    raise ValueError("Unsupported activation: %s" % act)
+
+
+def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
+  """Compute the union of the current variables and checkpoint variables."""
+  assignment_map = {}
+  initialized_variable_names = {}
+
+  name_to_variable = collections.OrderedDict()
+  for var in tvars:
+    name = var.name
+    m = re.match("^(.*):\\d+$", name)
+    if m is not None:
+      name = m.group(1)
+    name_to_variable[name] = var
+
+  init_vars = tf.train.list_variables(init_checkpoint)
+
+  assignment_map = collections.OrderedDict()
+  for x in init_vars:
+    (name, var) = (x[0], x[1])
+    if name not in name_to_variable:
+      continue
+    assignment_map[name] = name
+    initialized_variable_names[name] = 1
+    initialized_variable_names[name + ":0"] = 1
+
+  return (assignment_map, initialized_variable_names)
+
+
+def dropout(input_tensor, dropout_prob):
+  """Perform dropout.
+
+  Args:
+    input_tensor: float Tensor.
+    dropout_prob: Python float. The probability of dropping out a value (NOT of
+      *keeping* a dimension as in `tf.nn.dropout`).
+
+  Returns:
+    A version of `input_tensor` with dropout applied.
+  """
+  if dropout_prob is None or dropout_prob == 0.0:
+    return input_tensor
+
+  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
+  return output
+
+
+def layer_norm(input_tensor, name=None):
+  """Run layer normalization on the last dimension of the tensor."""
+  return tf.contrib.layers.layer_norm(
+      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
+
+
+def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
+  """Runs layer normalization followed by dropout."""
+  output_tensor = layer_norm(input_tensor, name)
+  output_tensor = dropout(output_tensor, dropout_prob)
+  return output_tensor
+
+
+def create_initializer(initializer_range=0.02):
+  """Creates a `truncated_normal_initializer` with the given range."""
+  return tf.truncated_normal_initializer(stddev=initializer_range)
+
+
+def embedding_lookup(input_ids,
+                     vocab_size,
+                     embedding_size=128,
+                     initializer_range=0.02,
+                     word_embedding_name="word_embeddings",
+                     use_one_hot_embeddings=False):
+  """Looks up words embeddings for id tensor.
+
+  Args:
+    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
+      ids.
+    vocab_size: int. Size of the embedding vocabulary.
+    embedding_size: int. Width of the word embeddings.
+    initializer_range: float. Embedding initialization range.
+    word_embedding_name: string. Name of the embedding table.
+    use_one_hot_embeddings: bool. If True, use one-hot method for word
+      embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
+      for TPUs.
+
+  Returns:
+    float Tensor of shape [batch_size, seq_length, embedding_size].
+  """
+  # This function assumes that the input is of shape [batch_size, seq_length,
+  # num_inputs].
+  #
+  # If the input is a 2D tensor of shape [batch_size, seq_length], we
+  # reshape to [batch_size, seq_length, 1].
+  if input_ids.shape.ndims == 2:
+    input_ids = tf.expand_dims(input_ids, axis=[-1])
+
+  embedding_table = tf.get_variable(
+      name=word_embedding_name,
+      shape=[vocab_size, embedding_size],
+      initializer=create_initializer(initializer_range))
+
+  if use_one_hot_embeddings:
+    flat_input_ids = tf.reshape(input_ids, [-1])
+    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
+    output = tf.matmul(one_hot_input_ids, embedding_table)
+  else:
+    output = tf.nn.embedding_lookup(embedding_table, input_ids)
+
+  input_shape = get_shape_list(input_ids)
+
+  output = tf.reshape(output,
+                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
+  return (output, embedding_table)
+
+
+def embedding_postprocessor(input_tensor,
+                            use_token_type=False,
+                            token_type_ids=None,
+                            token_type_vocab_size=16,
+                            token_type_embedding_name="token_type_embeddings",
+                            use_position_embeddings=True,
+                            position_embedding_name="position_embeddings",
+                            initializer_range=0.02,
+                            max_position_embeddings=512,
+                            dropout_prob=0.1):
+  """Performs various post-processing on a word embedding tensor.
+
+  Args:
+    input_tensor: float Tensor of shape [batch_size, seq_length,
+      embedding_size].
+    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
+    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
+      Must be specified if `use_token_type` is True.
+    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
+    token_type_embedding_name: string. The name of the embedding table variable
+      for token type ids.
+    use_position_embeddings: bool. Whether to add position embeddings for the
+      position of each token in the sequence.
+    position_embedding_name: string. The name of the embedding table variable
+      for positional embeddings.
+    initializer_range: float. Range of the weight initialization.
+    max_position_embeddings: int. Maximum sequence length that might ever be
+      used with this model. This can be longer than the sequence length of
+      input_tensor, but cannot be shorter.
+    dropout_prob: float. Dropout probability applied to the final output tensor.
+
+  Returns:
+    float tensor with same shape as `input_tensor`.
+
+  Raises:
+    ValueError: One of the tensor shapes or input values is invalid.
+  """
+  input_shape = get_shape_list(input_tensor, expected_rank=3)
+  batch_size = input_shape[0]
+  seq_length = input_shape[1]
+  width = input_shape[2]
+
+  output = input_tensor
+
+  if use_token_type:
+    if token_type_ids is None:
+      raise ValueError("`token_type_ids` must be specified if"
+                       "`use_token_type` is True.")
+    token_type_table = tf.get_variable(
+        name=token_type_embedding_name,
+        shape=[token_type_vocab_size, width],
+        initializer=create_initializer(initializer_range))
+    # This vocab will be small so we always do one-hot here, since it is always
+    # faster for a small vocabulary.
+    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
+    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
+    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
+    token_type_embeddings = tf.reshape(token_type_embeddings,
+                                       [batch_size, seq_length, width])
+    output += token_type_embeddings
+
+  if use_position_embeddings:
+    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
+    with tf.control_dependencies([assert_op]):
+      full_position_embeddings = tf.get_variable(
+          name=position_embedding_name,
+          shape=[max_position_embeddings, width],
+          initializer=create_initializer(initializer_range))
+      # Since the position embedding table is a learned variable, we create it
+      # using a (long) sequence length `max_position_embeddings`. The actual
+      # sequence length might be shorter than this, for faster training of
+      # tasks that do not have long sequences.
+      #
+      # So `full_position_embeddings` is effectively an embedding table
+      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
+      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
+      # perform a slice.
+      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
+                                     [seq_length, -1])
+      num_dims = len(output.shape.as_list())
+
+      # Only the last two dimensions are relevant (`seq_length` and `width`), so
+      # we broadcast among the first dimensions, which is typically just
+      # the batch size.
+      position_broadcast_shape = []
+      for _ in range(num_dims - 2):
+        position_broadcast_shape.append(1)
+      position_broadcast_shape.extend([seq_length, width])
+      position_embeddings = tf.reshape(position_embeddings,
+                                       position_broadcast_shape)
+      output += position_embeddings
+
+  output = layer_norm_and_dropout(output, dropout_prob)
+  return output
+
+
+def create_attention_mask_from_input_mask(from_tensor, to_mask):
+  """Create 3D attention mask from a 2D tensor mask.
+
+  Args:
+    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
+    to_mask: int32 Tensor of shape [batch_size, to_seq_length].
+
+  Returns:
+    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
+  """
+  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
+  batch_size = from_shape[0]
+  from_seq_length = from_shape[1]
+
+  to_shape = get_shape_list(to_mask, expected_rank=2)
+  to_seq_length = to_shape[1]
+
+  to_mask = tf.cast(
+      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
+
+  # We don't assume that `from_tensor` is a mask (although it could be). We
+  # don't actually care if we attend *from* padding tokens (only *to* padding)
+  # tokens so we create a tensor of all ones.
+  #
+  # `broadcast_ones` = [batch_size, from_seq_length, 1]
+  broadcast_ones = tf.ones(
+      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
+
+  # Here we broadcast along two dimensions to create the mask.
+  mask = broadcast_ones * to_mask
+
+  return mask
+
+
+def attention_layer(from_tensor,
+                    to_tensor,
+                    attention_mask=None,
+                    num_attention_heads=1,
+                    size_per_head=512,
+                    query_act=None,
+                    key_act=None,
+                    value_act=None,
+                    attention_probs_dropout_prob=0.0,
+                    initializer_range=0.02,
+                    do_return_2d_tensor=False,
+                    batch_size=None,
+                    from_seq_length=None,
+                    to_seq_length=None):
+  """Performs multi-headed attention from `from_tensor` to `to_tensor`.
+
+  This is an implementation of multi-headed attention based on "Attention
+  is all you Need". If `from_tensor` and `to_tensor` are the same, then
+  this is self-attention. Each timestep in `from_tensor` attends to the
+  corresponding sequence in `to_tensor`, and returns a fixed-with vector.
+
+  This function first projects `from_tensor` into a "query" tensor and
+  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
+  of tensors of length `num_attention_heads`, where each tensor is of shape
+  [batch_size, seq_length, size_per_head].
+
+  Then, the query and key tensors are dot-producted and scaled. These are
+  softmaxed to obtain attention probabilities. The value tensors are then
+  interpolated by these probabilities, then concatenated back to a single
+  tensor and returned.
+
+  In practice, the multi-headed attention are done with transposes and
+  reshapes rather than actual separate tensors.
+
+  Args:
+    from_tensor: float Tensor of shape [batch_size, from_seq_length,
+      from_width].
+    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
+    attention_mask: (optional) int32 Tensor of shape [batch_size,
+      from_seq_length, to_seq_length]. The values should be 1 or 0. The
+      attention scores will effectively be set to -infinity for any positions in
+      the mask that are 0, and will be unchanged for positions that are 1.
+    num_attention_heads: int. Number of attention heads.
+    size_per_head: int. Size of each attention head.
+    query_act: (optional) Activation function for the query transform.
+    key_act: (optional) Activation function for the key transform.
+    value_act: (optional) Activation function for the value transform.
+    attention_probs_dropout_prob: (optional) float. Dropout probability of the
+      attention probabilities.
+    initializer_range: float. Range of the weight initializer.
+    do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
+      * from_seq_length, num_attention_heads * size_per_head]. If False, the
+      output will be of shape [batch_size, from_seq_length, num_attention_heads
+      * size_per_head].
+    batch_size: (Optional) int. If the input is 2D, this might be the batch size
+      of the 3D version of the `from_tensor` and `to_tensor`.
+    from_seq_length: (Optional) If the input is 2D, this might be the seq length
+      of the 3D version of the `from_tensor`.
+    to_seq_length: (Optional) If the input is 2D, this might be the seq length
+      of the 3D version of the `to_tensor`.
+
+  Returns:
+    float Tensor of shape [batch_size, from_seq_length,
+      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
+      true, this will be of shape [batch_size * from_seq_length,
+      num_attention_heads * size_per_head]).
+
+  Raises:
+    ValueError: Any of the arguments or tensor shapes are invalid.
+  """
+
+  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
+                           seq_length, width):
+    output_tensor = tf.reshape(
+        input_tensor, [batch_size, seq_length, num_attention_heads, width])
+
+    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
+    return output_tensor
+
+  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
+  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
+
+  if len(from_shape) != len(to_shape):
+    raise ValueError(
+        "The rank of `from_tensor` must match the rank of `to_tensor`.")
+
+  if len(from_shape) == 3:
+    batch_size = from_shape[0]
+    from_seq_length = from_shape[1]
+    to_seq_length = to_shape[1]
+  elif len(from_shape) == 2:
+    if (batch_size is None or from_seq_length is None or to_seq_length is None):
+      raise ValueError(
+          "When passing in rank 2 tensors to attention_layer, the values "
+          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
+          "must all be specified.")
+
+  # Scalar dimensions referenced here:
+  #   B = batch size (number of sequences)
+  #   F = `from_tensor` sequence length
+  #   T = `to_tensor` sequence length
+  #   N = `num_attention_heads`
+  #   H = `size_per_head`
+
+  from_tensor_2d = reshape_to_matrix(from_tensor)
+  to_tensor_2d = reshape_to_matrix(to_tensor)
+
+  # `query_layer` = [B*F, N*H]
+  query_layer = tf.layers.dense(
+      from_tensor_2d,
+      num_attention_heads * size_per_head,
+      activation=query_act,
+      name="query",
+      kernel_initializer=create_initializer(initializer_range))
+
+  # `key_layer` = [B*T, N*H]
+  key_layer = tf.layers.dense(
+      to_tensor_2d,
+      num_attention_heads * size_per_head,
+      activation=key_act,
+      name="key",
+      kernel_initializer=create_initializer(initializer_range))
+
+  # `value_layer` = [B*T, N*H]
+  value_layer = tf.layers.dense(
+      to_tensor_2d,
+      num_attention_heads * size_per_head,
+      activation=value_act,
+      name="value",
+      kernel_initializer=create_initializer(initializer_range))
+
+  # `query_layer` = [B, N, F, H]
+  query_layer = transpose_for_scores(query_layer, batch_size,
+                                     num_attention_heads, from_seq_length,
+                                     size_per_head)
+
+  # `key_layer` = [B, N, T, H]
+  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
+                                   to_seq_length, size_per_head)
+
+  # Take the dot product between "query" and "key" to get the raw
+  # attention scores.
+  # `attention_scores` = [B, N, F, T]
+  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+  attention_scores = tf.multiply(attention_scores,
+                                 1.0 / math.sqrt(float(size_per_head)))
+
+  if attention_mask is not None:
+    # `attention_mask` = [B, 1, F, T]
+    attention_mask = tf.expand_dims(attention_mask, axis=[1])
+
+    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+    # masked positions, this operation will create a tensor which is 0.0 for
+    # positions we want to attend and -10000.0 for masked positions.
+    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
+
+    # Since we are adding it to the raw scores before the softmax, this is
+    # effectively the same as removing these entirely.
+    attention_scores += adder
+
+  # Normalize the attention scores to probabilities.
+  # `attention_probs` = [B, N, F, T]
+  attention_probs = tf.nn.softmax(attention_scores)
+
+  # This is actually dropping out entire tokens to attend to, which might
+  # seem a bit unusual, but is taken from the original Transformer paper.
+  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
+
+  # `value_layer` = [B, T, N, H]
+  value_layer = tf.reshape(
+      value_layer,
+      [batch_size, to_seq_length, num_attention_heads, size_per_head])
+
+  # `value_layer` = [B, N, T, H]
+  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
+
+  # `context_layer` = [B, N, F, H]
+  context_layer = tf.matmul(attention_probs, value_layer)
+
+  # `context_layer` = [B, F, N, H]
+  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
+
+  if do_return_2d_tensor:
+    # `context_layer` = [B*F, N*V]
+    context_layer = tf.reshape(
+        context_layer,
+        [batch_size * from_seq_length, num_attention_heads * size_per_head])
+  else:
+    # `context_layer` = [B, F, N*V]
+    context_layer = tf.reshape(
+        context_layer,
+        [batch_size, from_seq_length, num_attention_heads * size_per_head])
+
+  return context_layer
+
+
+def transformer_model(input_tensor,
+                      attention_mask=None,
+                      hidden_size=768,
+                      num_hidden_layers=12,
+                      num_attention_heads=12,
+                      intermediate_size=3072,
+                      intermediate_act_fn=gelu,
+                      hidden_dropout_prob=0.1,
+                      attention_probs_dropout_prob=0.1,
+                      initializer_range=0.02,
+                      do_return_all_layers=False):
+  """Multi-headed, multi-layer Transformer from "Attention is All You Need".
+
+  This is almost an exact implementation of the original Transformer encoder.
+
+  See the original paper:
+  https://arxiv.org/abs/1706.03762
+
+  Also see:
+  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
+
+  Args:
+    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
+    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
+      seq_length], with 1 for positions that can be attended to and 0 in
+      positions that should not be.
+    hidden_size: int. Hidden size of the Transformer.
+    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
+    num_attention_heads: int. Number of attention heads in the Transformer.
+    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
+      forward) layer.
+    intermediate_act_fn: function. The non-linear activation function to apply
+      to the output of the intermediate/feed-forward layer.
+    hidden_dropout_prob: float. Dropout probability for the hidden layers.
+    attention_probs_dropout_prob: float. Dropout probability of the attention
+      probabilities.
+    initializer_range: float. Range of the initializer (stddev of truncated
+      normal).
+    do_return_all_layers: Whether to also return all layers or just the final
+      layer.
+
+  Returns:
+    float Tensor of shape [batch_size, seq_length, hidden_size], the final
+    hidden layer of the Transformer.
+
+  Raises:
+    ValueError: A Tensor shape or parameter is invalid.
+  """
+  if hidden_size % num_attention_heads != 0:
+    raise ValueError(
+        "The hidden size (%d) is not a multiple of the number of attention "
+        "heads (%d)" % (hidden_size, num_attention_heads))
+
+  attention_head_size = int(hidden_size / num_attention_heads)
+  input_shape = get_shape_list(input_tensor, expected_rank=3)
+  batch_size = input_shape[0]
+  seq_length = input_shape[1]
+  input_width = input_shape[2]
+
+  # The Transformer performs sum residuals on all layers so the input needs
+  # to be the same as the hidden size.
+  if input_width != hidden_size:
+    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
+                     (input_width, hidden_size))
+
+  # We keep the representation as a 2D tensor to avoid re-shaping it back and
+  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
+  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
+  # help the optimizer.
+  prev_output = reshape_to_matrix(input_tensor)
+
+  all_layer_outputs = []
+  for layer_idx in range(num_hidden_layers):
+    with tf.variable_scope("layer_%d" % layer_idx):
+      layer_input = prev_output
+
+      with tf.variable_scope("attention"):
+        attention_heads = []
+        with tf.variable_scope("self"):
+          attention_head = attention_layer(
+              from_tensor=layer_input,
+              to_tensor=layer_input,
+              attention_mask=attention_mask,
+              num_attention_heads=num_attention_heads,
+              size_per_head=attention_head_size,
+              attention_probs_dropout_prob=attention_probs_dropout_prob,
+              initializer_range=initializer_range,
+              do_return_2d_tensor=True,
+              batch_size=batch_size,
+              from_seq_length=seq_length,
+              to_seq_length=seq_length)
+          attention_heads.append(attention_head)
+
+        attention_output = None
+        if len(attention_heads) == 1:
+          attention_output = attention_heads[0]
+        else:
+          # In the case where we have other sequences, we just concatenate
+          # them to the self-attention head before the projection.
+          attention_output = tf.concat(attention_heads, axis=-1)
+
+        # Run a linear projection of `hidden_size` then add a residual
+        # with `layer_input`.
+        with tf.variable_scope("output"):
+          attention_output = tf.layers.dense(
+              attention_output,
+              hidden_size,
+              kernel_initializer=create_initializer(initializer_range))
+          attention_output = dropout(attention_output, hidden_dropout_prob)
+          attention_output = layer_norm(attention_output + layer_input)
+
+      # The activation is only applied to the "intermediate" hidden layer.
+      with tf.variable_scope("intermediate"):
+        intermediate_output = tf.layers.dense(
+            attention_output,
+            intermediate_size,
+            activation=intermediate_act_fn,
+            kernel_initializer=create_initializer(initializer_range))
+
+      # Down-project back to `hidden_size` then add the residual.
+      with tf.variable_scope("output"):
+        layer_output = tf.layers.dense(
+            intermediate_output,
+            hidden_size,
+            kernel_initializer=create_initializer(initializer_range))
+        layer_output = dropout(layer_output, hidden_dropout_prob)
+        layer_output = layer_norm(layer_output + attention_output)
+        prev_output = layer_output
+        all_layer_outputs.append(layer_output)
+
+  if do_return_all_layers:
+    final_outputs = []
+    for layer_output in all_layer_outputs:
+      final_output = reshape_from_matrix(layer_output, input_shape)
+      final_outputs.append(final_output)
+    return final_outputs
+  else:
+    final_output = reshape_from_matrix(prev_output, input_shape)
+    return final_output
+
+
+def get_shape_list(tensor, expected_rank=None, name=None):
+  """Returns a list of the shape of tensor, preferring static dimensions.
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, and exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if name is None:
+    name = tensor.name
+
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def reshape_to_matrix(input_tensor):
+  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
+  ndims = input_tensor.shape.ndims
+  if ndims < 2:
+    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
+                     (input_tensor.shape))
+  if ndims == 2:
+    return input_tensor
+
+  width = input_tensor.shape[-1]
+  output_tensor = tf.reshape(input_tensor, [-1, width])
+  return output_tensor
+
+
+def reshape_from_matrix(output_tensor, orig_shape_list):
+  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
+  if len(orig_shape_list) == 2:
+    return output_tensor
+
+  output_shape = get_shape_list(output_tensor)
+
+  orig_dims = orig_shape_list[0:-1]
+  width = output_shape[-1]
+
+  return tf.reshape(output_tensor, orig_dims + [width])
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  if name is None:
+    name = tensor.name
+
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    scope_name = tf.get_variable_scope().name
+    raise ValueError(
+        "For the tensor `%s` in scope `%s`, the actual rank "
+        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
+        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
--- a/optimization.py
+++ b/optimization.py
@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import tensorflow as tf
+
+
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
+  """Creates an optimizer training op."""
+  global_step = tf.train.get_or_create_global_step()
+
+  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
+
+  # Implements linear decay of the learning rate.
+  learning_rate = tf.train.polynomial_decay(
+      learning_rate,
+      global_step,
+      num_train_steps,
+      end_learning_rate=0.0,
+      power=1.0,
+      cycle=False)
+
+  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
+  # learning rate will be `global_step/num_warmup_steps * init_lr`.
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  # It is recommended that you use this optimizer for fine tuning, since this
+  # is how the model was trained (note that the Adam m/v variables are NOT
+  # loaded from init_checkpoint.)
+  optimizer = AdamWeightDecayOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs a AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want ot decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
+      if self._do_use_weight_decay(param_name):
+        update += self.weight_decay_rate * param
+
+      update_with_lr = self.learning_rate * update
+
+      next_param = param - update_with_lr
+
+      assignments.extend(
+          [param.assign(next_param),
+           m.assign(next_m),
+           v.assign(next_v)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+tensorflow >= 1.11.0   # CPU Version of TensorFlow.
+# tensorflow-gpu  >= 1.11.0  # GPU version of TensorFlow.
--- a/similarity.py
+++ b/similarity.py
@ -0,0 +1,676 @@
+import os
+from queue import Queue
+from threading import Thread
+
+import pandas as pd
+import tensorflow as tf
+import collections
+import args
+import tokenization
+import modeling
+import optimization
+
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = '1'
+
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs a InputExample.
+
+        Args:
+          guid: Unique id for the example.
+          text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+          text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+          label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for prediction."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+
+class SimProcessor(DataProcessor):
+    def get_train_examples(self, data_dir):
+        file_path = os.path.join(data_dir, 'train.csv')
+        train_df = pd.read_csv(file_path, encoding='utf-8')
+        train_data = []
+        for index, train in enumerate(train_df.values):
+            guid = 'train-%d' % index
+            text_a = tokenization.convert_to_unicode(str(train[0]))
+            text_b = tokenization.convert_to_unicode(str(train[1]))
+            label = str(train[2])
+            train_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return train_data
+
+    def get_dev_examples(self, data_dir):
+        file_path = os.path.join(data_dir, 'test.csv')
+        dev_df = pd.read_csv(file_path, encoding='utf-8')
+        dev_data = []
+        for index, dev in enumerate(dev_df.values):
+            guid = 'test-%d' % index
+            text_a = tokenization.convert_to_unicode(str(dev[0]))
+            text_b = tokenization.convert_to_unicode(str(dev[1]))
+            label = str(dev[2])
+            dev_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return dev_data
+
+    def get_test_examples(self, data_dir):
+        file_path = os.path.join(data_dir, 'test.csv')
+        test_df = pd.read_csv(file_path, encoding='utf-8')
+        test_data = []
+        for index, test in enumerate(test_df.values):
+            guid = 'test-%d' % index
+            text_a = tokenization.convert_to_unicode(str(test[0]))
+            text_b = tokenization.convert_to_unicode(str(test[1]))
+            label = str(test[2])
+            test_data.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return test_data
+
+    def get_sentence_examples(self, questions):
+        for index, data in enumerate(questions):
+            guid = 'test-%d' % index
+            text_a = tokenization.convert_to_unicode(str(data[0]))
+            text_b = tokenization.convert_to_unicode(str(data[1]))
+            label = str(0)
+            yield InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
+
+    def get_labels(self):
+        return ['0', '1']
+
+
+class BertSim:
+
+    def __init__(self, batch_size=args.batch_size):
+        self.mode = None
+        self.max_seq_length = args.max_seq_len
+        self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
+        self.batch_size = batch_size
+        self.estimator = None
+        self.processor = SimProcessor()
+        tf.logging.set_verbosity(tf.logging.INFO)
+
+    def set_mode(self, mode):
+        self.mode = mode
+        self.estimator = self.get_estimator()
+        if mode == tf.estimator.ModeKeys.TRAIN:
+            self.input_queue = Queue(maxsize=1)
+            self.output_queue = Queue(maxsize=1)
+            self.predict_thread = Thread(target=self.predict_from_queue, daemon=True)
+            self.predict_thread.start()
+
+    def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
+                     labels, num_labels, use_one_hot_embeddings):
+        """Creates a classification model."""
+        model = modeling.BertModel(
+            config=bert_config,
+            is_training=is_training,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            token_type_ids=segment_ids,
+            use_one_hot_embeddings=use_one_hot_embeddings)
+
+        # In the demo, we are doing a simple classification task on the entire
+        # segment.
+        #
+        # If you want to use the token-level output, use model.get_sequence_output()
+        # instead.
+        output_layer = model.get_pooled_output()
+
+        hidden_size = output_layer.shape[-1].value
+
+        output_weights = tf.get_variable(
+            "output_weights", [num_labels, hidden_size],
+            initializer=tf.truncated_normal_initializer(stddev=0.02))
+
+        output_bias = tf.get_variable(
+            "output_bias", [num_labels], initializer=tf.zeros_initializer())
+
+        with tf.variable_scope("loss"):
+            if is_training:
+                # I.e., 0.1 dropout
+                output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+
+            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
+            logits = tf.nn.bias_add(logits, output_bias)
+            probabilities = tf.nn.softmax(logits, axis=-1)
+            log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+
+            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+            loss = tf.reduce_mean(per_example_loss)
+
+            return (loss, per_example_loss, logits, probabilities)
+
+    def model_fn_builder(self, bert_config, num_labels, init_checkpoint, learning_rate,
+                         num_train_steps, num_warmup_steps,
+                         use_one_hot_embeddings):
+        """Returns `model_fn` closure for TPUEstimator."""
+
+        def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+            from tensorflow.python.estimator.model_fn import EstimatorSpec
+
+            tf.logging.info("*** Features ***")
+            for name in sorted(features.keys()):
+                tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+            input_ids = features["input_ids"]
+            input_mask = features["input_mask"]
+            segment_ids = features["segment_ids"]
+            label_ids = features["label_ids"]
+
+            is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+            (total_loss, per_example_loss, logits, probabilities) = BertSim.create_model(
+                bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
+                num_labels, use_one_hot_embeddings)
+
+            tvars = tf.trainable_variables()
+            initialized_variable_names = {}
+
+            if init_checkpoint:
+                (assignment_map, initialized_variable_names) \
+                    = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+            tf.logging.info("**** Trainable Variables ****")
+            for var in tvars:
+                init_string = ""
+                if var.name in initialized_variable_names:
+                    init_string = ", *INIT_FROM_CKPT*"
+                tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                                init_string)
+
+            if mode == tf.estimator.ModeKeys.TRAIN:
+
+                train_op = optimization.create_optimizer(
+                    total_loss, learning_rate, num_train_steps, num_warmup_steps, False)
+
+                output_spec = EstimatorSpec(
+                    mode=mode,
+                    loss=total_loss,
+                    train_op=train_op)
+            elif mode == tf.estimator.ModeKeys.EVAL:
+
+                def metric_fn(per_example_loss, label_ids, logits):
+                    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+                    accuracy = tf.metrics.accuracy(label_ids, predictions)
+                    auc = tf.metrics.auc(label_ids, predictions)
+                    loss = tf.metrics.mean(per_example_loss)
+                    return {
+                        "eval_accuracy": accuracy,
+                        "eval_auc": auc,
+                        "eval_loss": loss,
+                    }
+
+                eval_metrics = metric_fn(per_example_loss, label_ids, logits)
+                output_spec = EstimatorSpec(
+                    mode=mode,
+                    loss=total_loss,
+                    eval_metric_ops=eval_metrics)
+            else:
+                output_spec = EstimatorSpec(mode=mode, predictions=probabilities)
+
+            return output_spec
+
+        return model_fn
+
+    def get_estimator(self):
+
+        from tensorflow.python.estimator.estimator import Estimator
+        from tensorflow.python.estimator.run_config import RunConfig
+
+        bert_config = modeling.BertConfig.from_json_file(args.config_name)
+        label_list = self.processor.get_labels()
+        train_examples = self.processor.get_train_examples(args.data_dir)
+        num_train_steps = int(
+            len(train_examples) / self.batch_size * args.num_train_epochs)
+        num_warmup_steps = int(num_train_steps * 0.1)
+
+        if self.mode == tf.estimator.ModeKeys.TRAIN:
+            init_checkpoint = args.ckpt_name
+        else:
+            init_checkpoint = args.output_dir
+
+        model_fn = self.model_fn_builder(
+            bert_config=bert_config,
+            num_labels=len(label_list),
+            init_checkpoint=init_checkpoint,
+            learning_rate=args.learning_rate,
+            num_train_steps=num_train_steps,
+            num_warmup_steps=num_warmup_steps,
+            use_one_hot_embeddings=False)
+
+        config = tf.ConfigProto()
+        config.gpu_options.allow_growth = True
+        config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
+        config.log_device_placement = False
+
+        return Estimator(model_fn=model_fn, config=RunConfig(session_config=config), model_dir=args.output_dir,
+                         params={'batch_size': self.batch_size})
+
+    def predict_from_queue(self):
+        for i in self.estimator.predict(input_fn=self.queue_predict_input_fn, yield_single_examples=False):
+            self.output_queue.put(i)
+
+    def queue_predict_input_fn(self):
+        return (tf.data.Dataset.from_generator(
+            self.generate_from_queue,
+            output_types={
+                'input_ids': tf.int32,
+                'input_mask': tf.int32,
+                'segment_ids': tf.int32,
+                'label_ids': tf.int32},
+            output_shapes={
+                'input_ids': (None, self.max_seq_length),
+                'input_mask': (None, self.max_seq_length),
+                'segment_ids': (None, self.max_seq_length),
+                'label_ids': (1,)}).prefetch(10))
+
+    def convert_examples_to_features(self, examples, label_list, max_seq_length, tokenizer):
+        """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+        for (ex_index, example) in enumerate(examples):
+            label_map = {}
+            for (i, label) in enumerate(label_list):
+                label_map[label] = i
+
+            tokens_a = tokenizer.tokenize(example.text_a)
+            tokens_b = None
+            if example.text_b:
+                tokens_b = tokenizer.tokenize(example.text_b)
+
+            if tokens_b:
+                # Modifies `tokens_a` and `tokens_b` in place so that the total
+                # length is less than the specified length.
+                # Account for [CLS], [SEP], [SEP] with "- 3"
+                self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+            else:
+                # Account for [CLS] and [SEP] with "- 2"
+                if len(tokens_a) > max_seq_length - 2:
+                    tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+            # The convention in BERT is:
+            # (a) For sequence pairs:
+            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+            # (b) For single sequences:
+            #  tokens:   [CLS] the dog is hairy . [SEP]
+            #  type_ids: 0     0   0   0  0     0 0
+            #
+            # Where "type_ids" are used to indicate whether this is the first
+            # sequence or the second sequence. The embedding vectors for `type=0` and
+            # `type=1` were learned during pre-training and are added to the wordpiece
+            # embedding vector (and position vector). This is not *strictly* necessary
+            # since the [SEP] token unambiguously separates the sequences, but it makes
+            # it easier for the model to learn the concept of sequences.
+            #
+            # For classification tasks, the first vector (corresponding to [CLS]) is
+            # used as as the "sentence vector". Note that this only makes sense because
+            # the entire model is fine-tuned.
+            tokens = []
+            segment_ids = []
+            tokens.append("[CLS]")
+            segment_ids.append(0)
+            for token in tokens_a:
+                tokens.append(token)
+                segment_ids.append(0)
+            tokens.append("[SEP]")
+            segment_ids.append(0)
+
+            if tokens_b:
+                for token in tokens_b:
+                    tokens.append(token)
+                    segment_ids.append(1)
+                tokens.append("[SEP]")
+                segment_ids.append(1)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            input_mask = [1] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            while len(input_ids) < max_seq_length:
+                input_ids.append(0)
+                input_mask.append(0)
+                segment_ids.append(0)
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            label_id = label_map[example.label]
+            if ex_index < 5:
+                tf.logging.info("*** Example ***")
+                tf.logging.info("guid: %s" % (example.guid))
+                tf.logging.info("tokens: %s" % " ".join(
+                    [tokenization.printable_text(x) for x in tokens]))
+                tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
+
+            feature = InputFeatures(
+                input_ids=input_ids,
+                input_mask=input_mask,
+                segment_ids=segment_ids,
+                label_id=label_id)
+
+            yield feature
+
+    def generate_from_queue(self):
+        while True:
+            predict_examples = self.processor.get_sentence_examples(self.input_queue.get())
+            features = list(self.convert_examples_to_features(predict_examples, self.processor.get_labels(),
+                                                              args.max_seq_len, self.tokenizer))
+            yield {
+                'input_ids': [f.input_ids for f in features],
+                'input_mask': [f.input_mask for f in features],
+                'segment_ids': [f.segment_ids for f in features],
+                'label_ids': [f.label_id for f in features]
+            }
+
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+
+        # This is a simple heuristic which will always truncate the longer sequence
+        # one token at a time. This makes more sense than truncating an equal percent
+        # of tokens from each, since if one sequence is very short then each token
+        # that's truncated likely contains more information than a longer sequence.
+        while True:
+            total_length = len(tokens_a) + len(tokens_b)
+            if total_length <= max_length:
+                break
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+
+    def convert_single_example(self, ex_index, example, label_list, max_seq_length, tokenizer):
+        """Converts a single `InputExample` into a single `InputFeatures`."""
+        label_map = {}
+        for (i, label) in enumerate(label_list):
+            label_map[label] = i
+
+        tokens_a = tokenizer.tokenize(example.text_a)
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.tokenize(example.text_b)
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+        # The convention in BERT is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            segment_ids.append(0)
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                segment_ids.append(1)
+            tokens.append("[SEP]")
+            segment_ids.append(1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        while len(input_ids) < max_seq_length:
+            input_ids.append(0)
+            input_mask.append(0)
+            segment_ids.append(0)
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+
+        label_id = label_map[example.label]
+        if ex_index < 5:
+            tf.logging.info("*** Example ***")
+            tf.logging.info("guid: %s" % (example.guid))
+            tf.logging.info("tokens: %s" % " ".join(
+                [tokenization.printable_text(x) for x in tokens]))
+            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
+
+        feature = InputFeatures(
+            input_ids=input_ids,
+            input_mask=input_mask,
+            segment_ids=segment_ids,
+            label_id=label_id)
+        return feature
+
+    def file_based_convert_examples_to_features(self, examples, label_list, max_seq_length, tokenizer, output_file):
+        """Convert a set of `InputExample`s to a TFRecord file."""
+
+        writer = tf.python_io.TFRecordWriter(output_file)
+
+        for (ex_index, example) in enumerate(examples):
+            if ex_index % 10000 == 0:
+                tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+            feature = self.convert_single_example(ex_index, example, label_list,
+                                                  max_seq_length, tokenizer)
+
+            def create_int_feature(values):
+                f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+                return f
+
+            features = collections.OrderedDict()
+            features["input_ids"] = create_int_feature(feature.input_ids)
+            features["input_mask"] = create_int_feature(feature.input_mask)
+            features["segment_ids"] = create_int_feature(feature.segment_ids)
+            features["label_ids"] = create_int_feature([feature.label_id])
+
+            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+            writer.write(tf_example.SerializeToString())
+
+    def file_based_input_fn_builder(self, input_file, seq_length, is_training, drop_remainder):
+        """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+        name_to_features = {
+            "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+            "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
+            "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
+            "label_ids": tf.FixedLenFeature([], tf.int64),
+        }
+
+        def _decode_record(record, name_to_features):
+            """Decodes a record to a TensorFlow example."""
+            example = tf.parse_single_example(record, name_to_features)
+
+            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+            # So cast all int64 to int32.
+            for name in list(example.keys()):
+                t = example[name]
+                if t.dtype == tf.int64:
+                    t = tf.to_int32(t)
+                example[name] = t
+
+            return example
+
+        def input_fn(params):
+            """The actual input function."""
+            batch_size = params["batch_size"]
+
+            # For training, we want a lot of parallel reading and shuffling.
+            # For eval, we want no shuffling and parallel reading doesn't matter.
+            d = tf.data.TFRecordDataset(input_file)
+            if is_training:
+                d = d.repeat()
+                d = d.shuffle(buffer_size=100)
+
+            d = d.apply(
+                tf.contrib.data.map_and_batch(
+                    lambda record: _decode_record(record, name_to_features),
+                    batch_size=batch_size,
+                    drop_remainder=drop_remainder))
+
+            return d
+
+        return input_fn
+
+    def train(self):
+        if self.mode is None:
+            raise ValueError("Please set the 'mode' parameter")
+
+        bert_config = modeling.BertConfig.from_json_file(args.config_name)
+
+        if args.max_seq_len > bert_config.max_position_embeddings:
+            raise ValueError(
+                "Cannot use sequence length %d because the BERT model "
+                "was only trained up to sequence length %d" %
+                (args.max_seq_len, bert_config.max_position_embeddings))
+
+        tf.gfile.MakeDirs(args.output_dir)
+
+        label_list = self.processor.get_labels()
+
+        train_examples = self.processor.get_train_examples(args.data_dir)
+        num_train_steps = int(len(train_examples) / args.batch_size * args.num_train_epochs)
+
+        estimator = self.get_estimator()
+
+        train_file = os.path.join(args.output_dir, "train.tf_record")
+        self.file_based_convert_examples_to_features(train_examples, label_list, args.max_seq_len, self.tokenizer,
+                                                     train_file)
+        tf.logging.info("***** Running training *****")
+        tf.logging.info("  Num examples = %d", len(train_examples))
+        tf.logging.info("  Batch size = %d", args.batch_size)
+        tf.logging.info("  Num steps = %d", num_train_steps)
+        train_input_fn = self.file_based_input_fn_builder(input_file=train_file, seq_length=args.max_seq_len,
+                                                          is_training=True,
+                                                          drop_remainder=True)
+
+        # early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
+        #     estimator,
+        #     metric_name='loss',
+        #     max_steps_without_decrease=10,
+        #     min_steps=num_train_steps)
+
+        # estimator.train(input_fn=train_input_fn, hooks=[early_stopping])
+        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+    def eval(self):
+        if self.mode is None:
+            raise ValueError("Please set the 'mode' parameter")
+        eval_examples = self.processor.get_dev_examples(args.data_dir)
+        eval_file = os.path.join(args.output_dir, "eval.tf_record")
+        label_list = self.processor.get_labels()
+        self.file_based_convert_examples_to_features(
+            eval_examples, label_list, args.max_seq_len, self.tokenizer, eval_file)
+
+        tf.logging.info("***** Running evaluation *****")
+        tf.logging.info("  Num examples = %d", len(eval_examples))
+        tf.logging.info("  Batch size = %d", self.batch_size)
+
+        eval_input_fn = self.file_based_input_fn_builder(
+            input_file=eval_file,
+            seq_length=args.max_seq_len,
+            is_training=False,
+            drop_remainder=False)
+
+        estimator = self.get_estimator()
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=None)
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with tf.gfile.GFile(output_eval_file, "w") as writer:
+            tf.logging.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                tf.logging.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+    def predict(self, sentence1, sentence2):
+        if self.mode is None:
+            raise ValueError("Please set the 'mode' parameter")
+        self.input_queue.put([(sentence1, sentence2)])
+        prediction = self.output_queue.get()
+        return prediction
+
+
+if __name__ == '__main__':
+    sim = BertSim()
+    sim.set_mode(tf.estimator.ModeKeys.TRAIN)
+    sim.train()
+    sim.set_mode(tf.estimator.ModeKeys.EVAL)
+    sim.eval()
+    # sim.set_mode(tf.estimator.ModeKeys.PREDICT)
+    # while True:
+    #     sentence1 = input('sentence1: ')
+    #     sentence2 = input('sentence2: ')
+    #     predict = sim.predict(sentence1, sentence2)
+    #     print(f'similarity：{predict[0][1]}')
--- a/tokenization.py
+++ b/tokenization.py
@ -0,0 +1,348 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+import tensorflow as tf
+
+
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python2 or Python 3?")
+
+
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+  # These functions want `str` for both Python2 and Python3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with tf.gfile.GFile(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenziation."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. This is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't have any Chinese data in them (there are Chinese
+    # characters in the vocabulary because Wikipedia does have some Chinese
+    # words in the English Wikipedia.).
+    text = self._tokenize_chinese_chars(text)
+
+    orig_tokens = whitespace_tokenize(text)
+    split_tokens = []
+    for token in orig_tokens:
+      if self.do_lower_case:
+        token = token.lower()
+        token = self._run_strip_accents(token)
+      split_tokens.extend(self._run_split_on_punc(token))
+
+    output_tokens = whitespace_tokenize(" ".join(split_tokens))
+    return output_tokens
+
+  def _run_strip_accents(self, text):
+    """Strips accents from a piece of text."""
+    text = unicodedata.normalize("NFD", text)
+    output = []
+    for char in text:
+      cat = unicodedata.category(char)
+      if cat == "Mn":
+        continue
+      output.append(char)
+    return "".join(output)
+
+  def _run_split_on_punc(self, text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+
+    return ["".join(x) for x in output]
+
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and handled
+    # like the all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+
+    return False
+
+  def _clean_text(self, text):
+    """Performs invalid character removal and whitespace cleanup on text."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if cp == 0 or cp == 0xfffd or _is_control(char):
+        continue
+      if _is_whitespace(char):
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+
+class WordpieceTokenizer(object):
+  """Runs WordPiece tokenziation."""
+
+  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+    self.vocab = vocab
+    self.unk_token = unk_token
+    self.max_input_chars_per_word = max_input_chars_per_word
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer.
+
+    Returns:
+      A list of wordpiece tokens.
+    """
+
+    text = convert_to_unicode(text)
+
+    output_tokens = []
+    for token in whitespace_tokenize(text):
+      chars = list(token)
+      if len(chars) > self.max_input_chars_per_word:
+        output_tokens.append(self.unk_token)
+        continue
+
+      is_bad = False
+      start = 0
+      sub_tokens = []
+      while start < len(chars):
+        end = len(chars)
+        cur_substr = None
+        while start < end:
+          substr = "".join(chars[start:end])
+          if start > 0:
+            substr = "##" + substr
+          if substr in self.vocab:
+            cur_substr = substr
+            break
+          end -= 1
+        if cur_substr is None:
+          is_bad = True
+          break
+        sub_tokens.append(cur_substr)
+        start = end
+
+      if is_bad:
+        output_tokens.append(self.unk_token)
+      else:
+        output_tokens.extend(sub_tokens)
+    return output_tokens
+
+
+def _is_whitespace(char):
+  """Checks whether `chars` is a whitespace character."""
+  # \t, \n, and \r are technically contorl characters but we treat them
+  # as whitespace since they are generally considered as such.
+  if char == " " or char == "\t" or char == "\n" or char == "\r":
+    return True
+  cat = unicodedata.category(char)
+  if cat == "Zs":
+    return True
+  return False
+
+
+def _is_control(char):
+  """Checks whether `chars` is a control character."""
+  # These are technically control characters but we count them as whitespace
+  # characters.
+  if char == "\t" or char == "\n" or char == "\r":
+    return False
+  cat = unicodedata.category(char)
+  if cat.startswith("C"):
+    return True
+  return False
+
+
+def _is_punctuation(char):
+  """Checks whether `chars` is a punctuation character."""
+  cp = ord(char)
+  # We treat all non-letter/number ASCII as punctuation.
+  # Characters such as "^", "$", and "`" are not in the Unicode
+  # Punctuation class but we treat them as punctuation anyways, for
+  # consistency.
+  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+    return True
+  cat = unicodedata.category(char)
+  if cat.startswith("P"):
+    return True
+  return False