Source code for openhands.datasets.isolated.rwth_phoenix_weather_signer03_cutout

import os
import pandas as pd
from bs4 import BeautifulSoup
from .base import BaseIsolatedDataset

class RWTH_Phoenix_Signer03_Dataset(BaseIsolatedDataset):
    """
    German isolated sign language dataset from the paper:

    `RWTH-PHOENIX-Weather: A Large Vocabulary Sign Language Recognition and Translation Corpus.
    <https://www-i6.informatik.rwth-aachen.de/~forster/database-rwth-phoenix.php>`_

    The Signer03 cutout has been taken for the experiments:

    - Image sequences:
      https://www-i6.informatik.rwth-aachen.de/ftp/pub/rwth-phoenix/rwth-phoenix-weather-signer03-cutout-images_20120820.tgz
    - Annotations:
      https://www-i6.informatik.rwth-aachen.de/ftp/pub/rwth-phoenix/rwth-phoenix-weather-signer03-cutout_20120820.tgz
    """

    lang_code = "gsg"

    def read_glosses(self):
        """
        Collect the gloss vocabulary from ``self.class_mappings_file_path``.

        The text of every ``orth`` element in that file is stripped of
        surrounding spaces, newlines and tabs, de-duplicated, and stored in
        ``self.glosses`` as a sorted list.
        """
        with open(self.class_mappings_file_path, 'r') as f:
            markup = f.read()

        # The original author switched this call from the "xml" parser to
        # "lxml" (the reason is not recorded); that choice is kept as-is.
        soup = BeautifulSoup(markup, "lxml")
        unique_glosses = {node.text.strip(' \n\t') for node in soup.find_all('orth')}

        # A set of strings iterates in a different order from one interpreter
        # run to the next (hash randomization), so expose a sorted list.
        # NOTE(review): presumably the base class numbers the classes by
        # iterating ``self.glosses`` -- confirm; sorting keeps any such
        # numbering reproducible across runs.
        self.glosses = sorted(unique_glosses)

    def read_original_dataset(self):
        """
        Append one ``(recording_name, gloss_id)`` tuple per sample to ``self.data``.

        ``self.split_file`` is parsed as XML. The ``name`` attribute of each
        ``recording`` element is paired with the id that ``self.gloss_to_id``
        holds for the stripped text of the matching ``orth`` element. A gloss
        missing from ``self.gloss_to_id`` is not handled here, so the lookup
        error propagates to the caller.
        """
        # The split file is XML, so it is read once and handed to the XML
        # parser only; it is not a CSV table.
        with open(self.split_file, 'r') as f:
            markup = f.read()

        soup = BeautifulSoup(markup, "xml")
        recordings = soup.find_all('recording')
        orths = soup.find_all('orth')

        # NOTE(review): the two lists are paired purely by document order;
        # this assumes exactly one ``orth`` per ``recording`` -- confirm
        # against the annotation files.
        for recording, orth in zip(recordings, orths):
            gloss_id = self.gloss_to_id[orth.text.strip(' \n\t')]
            self.data.append((recording.get('name'), gloss_id))