NLP_Project/data_loader.py at master · Reid-Cui7/NLP_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import numpy as np
import jieba
import torch
from torch.utils.data import DataLoader
from functools import reduce
from collections import Counter
from embedding import get_embedding
from icecream import ic

def add_with_print(all_corpus):
    """Build an ``a + b`` callback that reports how often it has been called.

    Useful for wrapping a slow step (for example a long ``reduce``) so that
    it is visible whether, and how far, the step has progressed.

    Args:
        all_corpus: Sized collection; only ``len(all_corpus)`` is read, and it
            is shown as the total in the progress message.

    Returns:
        A two-argument function returning ``a + b``. On every 5000th call it
        prints ``"<calls>/<len(all_corpus)> "`` without a trailing newline.
    """
    # The call counter lives on the function object itself, so creating a new
    # callback resets it for every previously created callback as well.
    add_with_print.i = 0

    def _add_and_report(left, right):
        add_with_print.i = add_with_print.i + 1
        calls = add_with_print.i
        if not calls % 5000:
            print('{}/{}'.format(calls, len(all_corpus)), end=' ')
        return left + right

    return _add_and_report

def get_all_vocabulary(train_file_path, vocab_size):
    """Return the most frequent jieba tokens found in a training CSV.

    Args:
        train_file_path: Path of a CSV file that has a ``sentence`` column.
        vocab_size: Maximum number of words to keep; forwarded to
            ``Counter.most_common``.

    Returns:
        list[str]: Words ordered from most to least frequent. The list is
        empty when the CSV contains no rows.
    """
    SENTENCE = 'sentence'

    corpus = pd.read_csv(train_file_path)
    total = len(corpus)

    # One counter updated in place: merging per-sentence counters with ``+``
    # copies the whole accumulated vocabulary on every row, and ``reduce``
    # without an initial value fails on an empty file.
    word_counts = Counter()
    for done, sentence in enumerate(corpus[SENTENCE].values, start=1):
        # Joining on spaces and splitting again discards the whitespace-only
        # tokens produced while segmenting.
        word_counts.update(' '.join(jieba.cut(sentence)).split())
        if done % 5000 == 0:
            # Progress indicator for large corpora.
            print('{}/{}'.format(done, total), end=' ')

    return [word for word, _ in word_counts.most_common(vocab_size)]

def tokenizer(sentence, vocab: dict):
    """Segment ``sentence`` with jieba and map every word to its id.

    Args:
        sentence: Text to segment.
        vocab: Mapping from word to id.

    Returns:
        list: One id per segmented word; words missing from ``vocab`` get
        id 1, which this function uses as the unknown-word id.
    """
    unknown_id = 1
    segmented = jieba.cut(sentence)
    return [vocab.get(token, unknown_id) for token in segmented]

def get_train_data(train_file, vocab2ids, val_ratio=0.2):
    """Load a labelled CSV, tokenize it and split it into train/validation.

    Args:
        train_file: Path of a CSV file with ``label`` and ``sentence`` columns.
        vocab2ids: Mapping from word to id, handed to ``tokenizer``.
        val_ratio: Fraction of rows (between 0 and 1) put in the validation
            split. Defaults to 0.2.

    Returns:
        tuple: ``(X_train, y_train, X_val, y_val, label2id, id2label)``.
        ``X_*`` are NumPy arrays of token-id sequences (a 2-D array when all
        sequences have the same length, otherwise a 1-D object array of
        lists). ``y_*`` are ``torch.long`` tensors of label ids.
        ``label2id`` / ``id2label`` map between the raw labels and those ids.

    Raises:
        ValueError: If ``val_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= val_ratio <= 1:
        raise ValueError('val_ratio must be between 0 and 1, got {}'.format(val_ratio))

    LABEL, SENTENCE = 'label', 'sentence'

    content = pd.read_csv(train_file)
    num_rows = len(content)
    num_val = int(num_rows * val_ratio)

    labels = content[LABEL].values
    token_ids = [
        [int(id_) for id_ in tokenizer(sentence, vocab2ids)]
        for sentence in content[SENTENCE].values
    ]

    # Sentences usually differ in length; recent NumPy versions refuse to
    # build an array from ragged nested lists, so fill an object array by
    # hand in that case.
    if len({len(seq) for seq in token_ids}) <= 1:
        sentence_ids = np.array(token_ids)
    else:
        sentence_ids = np.empty(num_rows, dtype=object)
        for row, seq in enumerate(token_ids):
            sentence_ids[row] = seq

    # A permutation uses every row exactly once. Sampling with replacement
    # would duplicate some rows, drop others and leak rows between the splits.
    ids = np.random.permutation(num_rows)

    train_ids = ids[num_val:]
    val_ids = ids[:num_val]

    X_train, y_train = sentence_ids[train_ids], labels[train_ids]
    X_val, y_val = sentence_ids[val_ids], labels[val_ids]

    # Build the mapping from all labels so that a label occurring only in the
    # validation split still has an id, and ids do not depend on the split.
    label2id = {label: i for i, label in enumerate(np.unique(labels))}
    id2label = {i: label for label, i in label2id.items()}
    y_train = torch.tensor([label2id[y] for y in y_train], dtype=torch.long)
    y_val = torch.tensor([label2id[y] for y in y_val], dtype=torch.long)

    return X_train, y_train, X_val, y_val, label2id, id2label

def build_dataloader(X_train, y_train, X_val, y_val, batch_size):
    """Wrap the train and validation splits in shuffling ``DataLoader`` objects.

    Args:
        X_train: Iterable of training inputs.
        y_train: Iterable of training targets, paired with ``X_train`` by position.
        X_val: Iterable of validation inputs.
        y_val: Iterable of validation targets, paired with ``X_val`` by position.
        batch_size: Batch size used by both loaders.

    Returns:
        tuple: ``(train_dataloader, val_dataloader)``; each loader iterates
        over a list of ``(x, y)`` pairs with ``shuffle=True``.
    """
    def _shuffled_loader(features, targets):
        # Materialise the pairs so the loader gets an indexable dataset.
        pairs = list(zip(features, targets))
        return DataLoader(pairs, batch_size=batch_size, shuffle=True)

    train_dataloader = _shuffled_loader(X_train, y_train)
    val_dataloader = _shuffled_loader(X_val, y_val)
    return train_dataloader, val_dataloader


if __name__ == "__main__":
    # Manual smoke test: load the vocabulary, build the embedding and the
    # train/validation loaders, then print a few batches.
    #
    # The disabled snippet below is what writes 'dataset/vocabulary.txt';
    # re-enable it to regenerate that file from the training CSV.
    # vocab_size = 10000
    # vocabulary = get_all_vocabulary(train_file_path='dataset/train.csv', vocab_size=vocab_size)
    # assert isinstance(vocabulary, list)
    # assert isinstance(vocabulary[0], str)
    # assert len(vocabulary) <= vocab_size

    # with open('dataset/vocabulary.txt', 'w', encoding='utf-8') as f:
    #     for word in vocabulary:
    #         f.write(word + '\n')
    # print('write vocabulary finished!')

    # One word per line; the context manager closes the file handle.
    with open('dataset/vocabulary.txt', 'r', encoding='utf-8') as f:
        vocabulary = [v.strip() for v in f]

    embedding, token2id, vocab_size = get_embedding(set(vocabulary))

    X_train, y_train, X_val, y_val, label2id, id2label = get_train_data('dataset/train.csv', vocab2ids=token2id)

    print(X_train, y_train, X_val, y_val, label2id, id2label)

    train_loader, val_loader = build_dataloader(X_train, y_train, X_val, y_val, batch_size=32)

    # Show the first few batches only (the loop stops once i exceeds 5).
    for i, (x, y) in enumerate(train_loader):
        ic(x)
        ic(y)
        if i > 5:
            break