german_ner

get_sentences_and_labels

Combines tokens into sentences and creates vocabulary sets for the training data and labels.

For simplicity, tokens with the 'O' entity tag are omitted.

Parameters:

Name    Type    Description                              Default
path    str     Path to the downloaded dataset file.     required

Returns:

Type                                               Description
Tuple[List[str], List[str], Set[str], Set[str]]    (sentences, labels, train_vocab, label_vocab)

Source code in fastestimator/fastestimator/dataset/data/german_ner.py
def get_sentences_and_labels(path: str) -> Tuple[List[str], List[str], Set[str], Set[str]]:
    """Combines tokens into sentences and create vocab set for train data and labels.

    For simplicity, tokens with the 'O' entity tag are omitted.

    Args:
        path: Path to the downloaded dataset file.

    Returns:
        (sentences, labels, train_vocab, label_vocab)
    """
    words, tags = [], []
    word_vocab, label_vocab = set(), set()
    sentences, labels = [], []
    with open(path) as data:
        for line in data:
            # Lines starting with '#' are comment/metadata lines; skip them
            if line[0] != '#':
                line = line.split()
                if len(line) > 2 and line[2] != 'O':
                    # Columns: token index, token, entity tag
                    words.append(line[1])
                    tags.append(line[2])
                    word_vocab.add(line[1])
                    label_vocab.add(line[2])
                else:
                    # An 'O' token (or a blank / short line) closes the current span
                    sentences.append(" ".join(words))
                    labels.append(list(tags))
                    words.clear()
                    tags.clear()
    sentences = list(filter(None, sentences))
    labels = list(filter(None, labels))
    return sentences[:10000], labels[:10000], word_vocab, label_vocab
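
A minimal usage sketch, assuming a local TSV file in the same index/token/tag tab-separated layout (the tokens below are illustrative placeholders, not verbatim dataset content):

import tempfile

from fastestimator.dataset.data.german_ner import get_sentences_and_labels

# Two tagged tokens followed by an 'O' token, which closes the current span
sample = ("# illustrative sentence\n"
          "1\tMax\tB-PER\n"
          "2\tMustermann\tI-PER\n"
          "3\tsagte\tO\n")
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as f:
    f.write(sample)

sentences, labels, word_vocab, label_vocab = get_sentences_and_labels(f.name)
print(sentences)    # ['Max Mustermann']
print(labels)       # [['B-PER', 'I-PER']]
print(label_vocab)  # {'B-PER', 'I-PER'}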

load_data

Load and return the GermEval dataset.

Dataset from GermEval 2014 contains 31,000 sentences corresponding to over 590,000 tokens from German Wikipedia and news corpora. Each sentence is encoded as one token per line, with information provided in tab-separated columns. Sourced from https://sites.google.com/site/germeval2014ner/data

Parameters:

Name        Type             Description                                                      Default
root_dir    Optional[str]    The path to store the downloaded data. When root_dir is not     None
                             provided, the data will be saved into fastestimator_data under
                             the user's home directory.

Returns:

Type                                                     Description
Tuple[NumpyDataset, NumpyDataset, Set[str], Set[str]]    (train_data, eval_data, train_vocab, label_vocab)

Source code in fastestimator/fastestimator/dataset/data/german_ner.py
def load_data(root_dir: Optional[str] = None) -> Tuple[NumpyDataset, NumpyDataset, Set[str], Set[str]]:
    """Load and return the GermEval dataset.

    Dataset from GermEval 2014 contains 31,000 sentences corresponding to over 590,000 tokens from German Wikipedia
    and news corpora. Each sentence is encoded as one token per line, with information provided in tab-separated
    columns. Sourced from https://sites.google.com/site/germeval2014ner/data

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.

    Returns:
        (train_data, eval_data, train_vocab, label_vocab)
    """
    url = 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1'
    home = str(Path.home())

    if root_dir is None:
        root_dir = os.path.join(home, 'fastestimator_data', 'GermEval')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'GermEval')
    os.makedirs(root_dir, exist_ok=True)

    data_path = os.path.join(root_dir, 'de_ner.tsv')
    data_folder_path = os.path.join(root_dir, 'germeval')

    if not os.path.exists(data_folder_path):
        # download
        if not os.path.exists(data_path):
            print("Downloading data to {}".format(root_dir))
            stream = requests.get(url, stream=True)  # python wget does not work
            total_size = int(stream.headers.get('content-length', 0))
            block_size = 128  # size of each streamed download chunk, in bytes
            progress = tqdm(total=total_size, unit='B', unit_scale=True)
            with open(data_path, 'wb') as outfile:
                for data in stream.iter_content(block_size):
                    progress.update(len(data))
                    outfile.write(data)
            progress.close()

    x, y, x_vocab, y_vocab = get_sentences_and_labels(data_path)

    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train = np.array(x_train)
    x_eval = np.array(x_eval)
    y_train = np.array(y_train)
    y_eval = np.array(y_eval)
    train_data = NumpyDataset({"x": x_train, "y": y_train})
    eval_data = NumpyDataset({"x": x_eval, "y": y_eval})
    return train_data, eval_data, x_vocab, y_vocab
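
A minimal usage sketch, assuming the default download location and NumpyDataset's dict-style indexing; the exact sizes and vocabulary contents depend on the downloaded file:

from fastestimator.dataset.data import german_ner

train_data, eval_data, train_vocab, label_vocab = german_ner.load_data()
print(len(train_data), len(eval_data))  # e.g. 8000 2000 with the 10,000-sentence cap and 80/20 split
print(sorted(label_vocab))              # entity tags such as 'B-LOC', 'B-PER', 'I-PER', ...
record = train_data[0]
print(record["x"])                      # a space-joined sentence string
print(record["y"])                      # the corresponding list of entity tags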