from tensorflow.keras.layers import TextVectorization
text_vectorization = TextVectorization(
    output_mode="int",
    )


import re
import string
import tensorflow as tf

# 표준화: 소문자화 및 마침표 제거
def custom_standardization_fn(string_tensor):
    lowercase_string = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(
        lowercase_string, f"[{re.escape(string.punctuation)}]", "")

# 공백 기준으로 쪼개기
def custom_split_fn(string_tensor):
    return tf.strings.split(string_tensor)

# 사용자 정의 표준화 및 쪼개기 활용
text_vectorization = TextVectorization(
    output_mode="int",
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)


dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]


text_vectorization.adapt(dataset)


vocabulary = text_vectorization.get_vocabulary()
vocabulary

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']


test_sentence = "I write, rewrite, and still rewrite again"


encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


inverse_vocab = dict(enumerate(vocabulary))

decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  26.1M      0  0:00:03  0:00:03 --:--:-- 26.1M


if 'google.colab' in str(get_ipython()):
    !rm -r aclImdb/train/unsup
else: 
    import shutil
    unsup_path = './aclImdb/train/unsup'
    shutil.rmtree(unsup_path)


if 'google.colab' in str(get_ipython()):
    !cat aclImdb/train/pos/4077_10.txt
else:
    with open('aclImdb/train/pos/4077_10.txt', 'r') as f:
        text = f.read()
        print(text)

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy


import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

for category in ("neg", "pos"):
    os.makedirs(val_dir / category)            # val 디렉토리 생성
    files = os.listdir(train_dir / category)
    
    random.Random(1337).shuffle(files)         # 훈련셋 무작위 섞기
    
    num_val_samples = int(0.2 * len(files))    # 20% 지정 후 검증셋으로 옮기기
    val_files = files[-num_val_samples:]
    
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)


from tensorflow import keras

batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size=batch_size
    )

val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size
    )

test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
    )

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    
    # 예제: 첫째 배치의 첫째 리뷰
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'The film begins with a bunch of kids in reform school and focuses on a kid named \'Gabe\', who has apparently worked hard to earn his parole. Gabe and his sister move to a new neighborhood to make a fresh start and soon Gabe meets up with the Dead End Kids. The Kids in this film are little punks, but they are much less antisocial than they\'d been in other previous films and down deep, they are well-meaning punks. However, in this neighborhood there are also some criminals who are perpetrating insurance fraud through arson and see Gabe as a convenient scapegoat--after all, he\'d been to reform school and no one would believe he was innocent once he was framed. So, when Gabe is about ready to be sent back to "The Big House", it\'s up to the rest of the gang to save him and expose the real crooks.<br /><br />The "Dead End Kids" appeared in several Warner Brothers films in the late 1930s and the films were generally very good (particularly ANGELS WITH DIRTY FACES). However, after the boys\' contracts expired, they went on to Monogram Studios and the films, to put it charitably, were very weak and formulaic--with Huntz Hall and Leo Gorcey being pretty much the whole show and the group being renamed "The Bowery Boys". Because ANGELS WASH THEIR FACES had the excellent writing and production values AND Hall and Gorcey were not constantly mugging for the camera, it\'s a pretty good film--and almost earns a score of 7 (it\'s REAL close). In fact, while this isn\'t a great film aesthetically, it\'s sure a lot of fun to watch, so I will give it a 7! Sure, it was a tad hokey-particularly towards the end when the kids take the law into their own hands and Reagan ignores the Bill of Rights--but it was also quite entertaining. The Dead End Kids are doing their best performances and Ronald Reagan and Ann Sheridan provided excellent support. Sure, this part of the film was illogical and impossible but somehow it was still funny and rather charming--so if you can suspend disbelief, it works well.', shape=(), dtype=string)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",
)

# 어휘색인 생성
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)  # 긍정일 확률 계산
    
    model = keras.Model(inputs, outputs)
    
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    
    return model


model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                    save_best_only=True)
]

model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Epoch 1/10
625/625 [==============================] - 10s 16ms/step - loss: 0.4074 - accuracy: 0.8277 - val_loss: 0.2792 - val_accuracy: 0.8908
Epoch 2/10
625/625 [==============================] - 3s 5ms/step - loss: 0.2746 - accuracy: 0.8981 - val_loss: 0.2774 - val_accuracy: 0.8964
Epoch 3/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2471 - accuracy: 0.9115 - val_loss: 0.2872 - val_accuracy: 0.8976
Epoch 4/10
625/625 [==============================] - 3s 5ms/step - loss: 0.2246 - accuracy: 0.9244 - val_loss: 0.3187 - val_accuracy: 0.8936
Epoch 5/10
625/625 [==============================] - 3s 6ms/step - loss: 0.2156 - accuracy: 0.9313 - val_loss: 0.3164 - val_accuracy: 0.8960
Epoch 6/10
625/625 [==============================] - 3s 6ms/step - loss: 0.2108 - accuracy: 0.9341 - val_loss: 0.3355 - val_accuracy: 0.8934
Epoch 7/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2052 - accuracy: 0.9366 - val_loss: 0.3354 - val_accuracy: 0.8944
Epoch 8/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2017 - accuracy: 0.9365 - val_loss: 0.3582 - val_accuracy: 0.8940
Epoch 9/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2013 - accuracy: 0.9394 - val_loss: 0.3497 - val_accuracy: 0.8938
Epoch 10/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2043 - accuracy: 0.9394 - val_loss: 0.3631 - val_accuracy: 0.8940
782/782 [==============================] - 8s 10ms/step - loss: 0.2861 - accuracy: 0.8885
Test acc: 0.888


text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)


text_vectorization.adapt(text_only_train_ds)

binary_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


model = get_model()

callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras",
                                    save_best_only=True)
]

model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

model = keras.models.load_model("binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Epoch 1/10
625/625 [==============================] - 12s 18ms/step - loss: 0.3857 - accuracy: 0.8347 - val_loss: 0.2791 - val_accuracy: 0.9000
Epoch 2/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2592 - accuracy: 0.9082 - val_loss: 0.2947 - val_accuracy: 0.8988
Epoch 3/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2277 - accuracy: 0.9241 - val_loss: 0.3060 - val_accuracy: 0.8978
Epoch 4/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2074 - accuracy: 0.9333 - val_loss: 0.3417 - val_accuracy: 0.8994
Epoch 5/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2070 - accuracy: 0.9365 - val_loss: 0.3538 - val_accuracy: 0.8968
Epoch 6/10
625/625 [==============================] - 4s 6ms/step - loss: 0.1997 - accuracy: 0.9395 - val_loss: 0.3908 - val_accuracy: 0.8946
Epoch 7/10
625/625 [==============================] - 4s 6ms/step - loss: 0.1940 - accuracy: 0.9421 - val_loss: 0.3715 - val_accuracy: 0.8940
Epoch 8/10
625/625 [==============================] - 4s 6ms/step - loss: 0.1902 - accuracy: 0.9427 - val_loss: 0.4054 - val_accuracy: 0.8930
Epoch 9/10
625/625 [==============================] - 4s 6ms/step - loss: 0.1952 - accuracy: 0.9432 - val_loss: 0.3848 - val_accuracy: 0.8880
Epoch 10/10
625/625 [==============================] - 4s 6ms/step - loss: 0.1949 - accuracy: 0.9441 - val_loss: 0.4011 - val_accuracy: 0.8912
782/782 [==============================] - 9s 11ms/step - loss: 0.2788 - accuracy: 0.8953
Test acc: 0.895


text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="count"
)


text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf",
)


text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("tfidf_2gram.keras",
                                    save_best_only=True)
]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_3 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                320016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
625/625 [==============================] - 11s 17ms/step - loss: 0.5232 - accuracy: 0.7588 - val_loss: 0.3197 - val_accuracy: 0.8806
Epoch 2/10
625/625 [==============================] - 4s 6ms/step - loss: 0.3534 - accuracy: 0.8442 - val_loss: 0.2946 - val_accuracy: 0.8954
Epoch 3/10
625/625 [==============================] - 4s 6ms/step - loss: 0.3231 - accuracy: 0.8609 - val_loss: 0.3086 - val_accuracy: 0.8864
Epoch 4/10
625/625 [==============================] - 4s 6ms/step - loss: 0.3053 - accuracy: 0.8734 - val_loss: 0.3087 - val_accuracy: 0.8814
Epoch 5/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2781 - accuracy: 0.8845 - val_loss: 0.3225 - val_accuracy: 0.8878
Epoch 6/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2703 - accuracy: 0.8870 - val_loss: 0.3472 - val_accuracy: 0.8702
Epoch 7/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2695 - accuracy: 0.8883 - val_loss: 0.3357 - val_accuracy: 0.8682
Epoch 8/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2650 - accuracy: 0.8931 - val_loss: 0.3343 - val_accuracy: 0.8664
Epoch 9/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2606 - accuracy: 0.8901 - val_loss: 0.3546 - val_accuracy: 0.8580
Epoch 10/10
625/625 [==============================] - 4s 6ms/step - loss: 0.2575 - accuracy: 0.8924 - val_loss: 0.3318 - val_accuracy: 0.8760
782/782 [==============================] - 8s 10ms/step - loss: 0.2998 - accuracy: 0.8927
Test acc: 0.893


inputs = keras.Input(shape=(1,), dtype="string")
# 텍스트 벡터화 추가
processed_inputs = text_vectorization(inputs)
# 훈련된 모델에 적용
outputs = model(processed_inputs)

# 최종 모델
inference_model = keras.Model(inputs, outputs)


import tensorflow as tf

raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])

predictions = inference_model(raw_text_data)
print(f"{float(predictions[0] * 100):.2f} percent positive")

92.10 percent positive

11장 자연어처리 1부¶

주요내용¶

11.1 자연어처리 소개¶

11.2 텍스트 벡터화¶

11.3 단어 모음 표현법: 집합과 시퀀스¶

11.3.1 IMDB 영화 리뷰 데이터 준비¶

11.3.2 단어주머니 기법¶