⛩️ Use get_data_splits (called from the train.train module) to split the data; see the usage sketch after the list:
- Training split (e.g., 70%): used to train the model. Here the model has access to both inputs and outputs to optimize its internal weights.
- Validation split (e.g., 15%): used to evaluate model performance after each training loop (epoch). The model does not use these outputs to optimize its weights; instead, performance on this split is used to tune training hyperparameters such as the learning rate.
- Test split (e.g., 15%): used to perform a one-time assessment of the model. This is our best measure of how the model may behave on new, unseen data.
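A minimal sketch of how these splits come together, assuming a preprocessed DataFrame `df` with `text` and `tag` columns (the variable names are illustrative):

```python
from tagifai import data

# Encode the raw labels into unique indices
label_encoder = data.LabelEncoder()
label_encoder.fit(df.tag)
y = label_encoder.encode(df.tag)

# 70% train, 15% validation, 15% test (stratified by label)
X_train, X_val, X_test, y_train, y_val, y_test = data.get_data_splits(
    X=df.text, y=y, train_size=0.7
)
```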
LabelEncoder
Encode labels into unique indices.
```python
# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y = label_encoder.encode(labels)
```
Source code in tagifai/data.py
````python
class LabelEncoder:
    """Encode labels into unique indices.

    ```python
    # Encode labels
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)
    ```
    """

    def __init__(self, class_to_index: Dict = {}) -> None:
        """Initialize the label encoder.

        Args:
            class_to_index (Dict, optional): mapping between classes and unique indices. Defaults to {}.
        """
        self.class_to_index = class_to_index or {}  # mutable defaults ;)
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y: List):
        """Fit a list of labels to the encoder.

        Args:
            y (List): raw labels.

        Returns:
            Fitted LabelEncoder instance.
        """
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y: List) -> np.ndarray:
        """Encode a list of raw labels.

        Args:
            y (List): raw labels.

        Returns:
            np.ndarray: encoded labels as indices.
        """
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y: List) -> List:
        """Decode a list of indices.

        Args:
            y (List): indices.

        Returns:
            List: labels.
        """
        classes = []
        for i, item in enumerate(y):
            classes.append(self.index_to_class[item])
        return classes

    def save(self, fp: str) -> None:
        """Save class instance to JSON file.

        Args:
            fp (str): filepath to save to.
        """
        with open(fp, "w") as fp:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp: str):
        """Load a JSON file into a dictionary and create a LabelEncoder
        instance from its unpacked key-value pairs.

        Args:
            fp (str): JSON filepath to load from.

        Returns:
            LabelEncoder instance.
        """
        with open(fp) as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)
````
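`decode` mirrors `encode`, so round trips are straightforward. A minimal sketch with made-up labels:

```python
label_encoder = LabelEncoder()
label_encoder.fit(["mlops", "nlp", "cv", "nlp"])
y = label_encoder.encode(["cv", "nlp"])
print(y)  # [0 2] (indices follow np.unique's sorted order)
print(label_encoder.decode(y))  # ['cv', 'nlp']
```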
__init__(self, class_to_index={})
special
Initialize the label encoder.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `class_to_index` | `Dict` | mapping between classes and unique indices. | `{}` |
Source code in tagifai/data.py
```python
def __init__(self, class_to_index: Dict = {}) -> None:
    """Initialize the label encoder.

    Args:
        class_to_index (Dict, optional): mapping between classes and unique indices. Defaults to {}.
    """
    self.class_to_index = class_to_index or {}  # mutable defaults ;)
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
```
decode(self, y)
Decode a list of indices.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `y` | `List` | indices. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `List` | labels. |
Source code in tagifai/data.py
```python
def decode(self, y: List) -> List:
    """Decode a list of indices.

    Args:
        y (List): indices.

    Returns:
        List: labels.
    """
    classes = []
    for i, item in enumerate(y):
        classes.append(self.index_to_class[item])
    return classes
```
encode(self, y)
Encode a list of raw labels.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `y` | `List` | raw labels. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `np.ndarray` | encoded labels as indices. |
Source code in tagifai/data.py
```python
def encode(self, y: List) -> np.ndarray:
    """Encode a list of raw labels.

    Args:
        y (List): raw labels.

    Returns:
        np.ndarray: encoded labels as indices.
    """
    encoded = np.zeros((len(y)), dtype=int)
    for i, item in enumerate(y):
        encoded[i] = self.class_to_index[item]
    return encoded
```
fit(self, y)
Fit a list of labels to the encoder.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `y` | `List` | raw labels. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `LabelEncoder` | fitted LabelEncoder instance. |
Source code in tagifai/data.py
```python
def fit(self, y: List):
    """Fit a list of labels to the encoder.

    Args:
        y (List): raw labels.

    Returns:
        Fitted LabelEncoder instance.
    """
    classes = np.unique(y)
    for i, class_ in enumerate(classes):
        self.class_to_index[class_] = i
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
    return self
```
load(fp)
classmethod
Load a JSON file into a dictionary and create a LabelEncoder instance from its unpacked key-value pairs.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fp` | `str` | JSON filepath to load from. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `LabelEncoder` | LabelEncoder instance. |
Source code in tagifai/data.py
```python
@classmethod
def load(cls, fp: str):
    """Load a JSON file into a dictionary and create a LabelEncoder
    instance from its unpacked key-value pairs.

    Args:
        fp (str): JSON filepath to load from.

    Returns:
        LabelEncoder instance.
    """
    with open(fp) as fp:
        kwargs = json.load(fp=fp)
    return cls(**kwargs)
```
save(self, fp)
Save class instance to JSON file.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `fp` | `str` | filepath to save to. | *required* |
Source code in tagifai/data.py
```python
def save(self, fp: str) -> None:
    """Save class instance to JSON file.

    Args:
        fp (str): filepath to save to.
    """
    with open(fp, "w") as fp:
        contents = {"class_to_index": self.class_to_index}
        json.dump(contents, fp, indent=4, sort_keys=False)
```
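Since only `class_to_index` is persisted, a fitted encoder can be saved and restored in one round trip (the filepath here is illustrative):

```python
# Persist the fitted encoder, then restore it elsewhere
label_encoder.save(fp="label_encoder.json")
label_encoder = LabelEncoder.load(fp="label_encoder.json")
print(label_encoder)  # <LabelEncoder(num_classes=...)>
```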
clean_text(text, lower=True, stem=False, stopwords=config.STOPWORDS)
Clean raw text.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | raw text to be cleaned. | *required* |
| `lower` | `bool` | whether to lowercase the text. | `True` |
| `stem` | `bool` | whether to stem the text. | `False` |
| `stopwords` | `List` | stopwords to remove from the text. | `config.STOPWORDS` |

Returns:

| Type | Description |
| --- | --- |
| `str` | cleaned text. |
Source code in tagifai/data.py
```python
def clean_text(
    text: str, lower: bool = True, stem: bool = False, stopwords=config.STOPWORDS
) -> str:
    """Clean raw text.

    Args:
        text (str): raw text to be cleaned.
        lower (bool): whether to lowercase the text.
        stem (bool): whether to stem the text.
        stopwords (List, optional): stopwords to remove. Defaults to config.STOPWORDS.

    Returns:
        str: cleaned text.
    """
    # Lower
    if lower:
        text = text.lower()

    # Remove stopwords
    if len(stopwords):
        pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
        text = pattern.sub("", text)

    # Remove links (before punctuation is stripped so the pattern can still match URLs)
    text = re.sub(r"http\S+", "", text)

    # Spacing and filters
    text = re.sub(
        r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text
    )  # add spacing between objects to be filtered
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends

    # Stemming
    if stem:
        stemmer = PorterStemmer()
        text = " ".join([stemmer.stem(word, to_lowercase=lower) for word in text.split(" ")])

    return text
```
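A quick illustration of the defaults in action (the sample sentence is made up; the expected output assumes the default stopword list, which includes "and" but not "using"):

```python
text = "Conditional image generation using variational autoencoders and GANs."
print(clean_text(text))
# conditional image generation using variational autoencoders gans
```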
get_data_splits(X, y, train_size=0.7)
Generate balanced data splits.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `X` | `pd.Series` | input features. | *required* |
| `y` | `np.ndarray` | encoded labels. | *required* |
| `train_size` | `float` | proportion of data to use for training. | `0.7` |

Returns:

| Type | Description |
| --- | --- |
| `Tuple` | data splits as Numpy arrays. |
Source code in tagifai/data.py
```python
def get_data_splits(X: pd.Series, y: np.ndarray, train_size: float = 0.7) -> Tuple:
    """Generate balanced data splits.

    Args:
        X (pd.Series): input features.
        y (np.ndarray): encoded labels.
        train_size (float, optional): proportion of data to use for training. Defaults to 0.7.

    Returns:
        Tuple: data splits as Numpy arrays.
    """
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test
```
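The held-out data is split in half between validation and test, so `train_size=0.7` yields a 70/15/15 split, and `stratify` keeps the label distribution consistent across splits. A quick sanity check (a sketch, assuming the splits from above):

```python
import numpy as np

# Label proportions should be roughly equal across the three splits
for name, split in [("train", y_train), ("val", y_val), ("test", y_test)]:
    counts = np.unique(split, return_counts=True)[1]
    print(name, np.round(counts / counts.sum(), 2))
```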
preprocess(df, lower, stem, min_freq)
Preprocess the data.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `pd.DataFrame` | Pandas DataFrame with data. | *required* |
| `lower` | `bool` | whether to lowercase the text. | *required* |
| `stem` | `bool` | whether to stem the text. | *required* |
| `min_freq` | `int` | minimum # of data points a label must have. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | Dataframe with preprocessed data. |
Source code in tagifai/data.py
```python
def preprocess(df: pd.DataFrame, lower: bool, stem: bool, min_freq: int) -> pd.DataFrame:
    """Preprocess the data.

    Args:
        df (pd.DataFrame): Pandas DataFrame with data.
        lower (bool): whether to lowercase the text.
        stem (bool): whether to stem the text.
        min_freq (int): minimum # of data points a label must have.

    Returns:
        pd.DataFrame: Dataframe with preprocessed data.
    """
    df["text"] = df.title + " " + df.description  # feature engineering
    df.text = df.text.apply(clean_text, lower=lower, stem=stem)  # clean text
    df = replace_oos_labels(
        df=df, labels=config.ACCEPTED_TAGS, label_col="tag", oos_label="other"
    )  # replace OOS labels
    df = replace_minority_labels(
        df=df, label_col="tag", min_freq=min_freq, new_label="other"
    )  # replace labels below min freq
    return df
```
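A sketch of the full preprocessing flow on a loaded dataset (the CSV path and `min_freq` value are illustrative; `config.ACCEPTED_TAGS` is read inside `preprocess`):

```python
import pandas as pd
from tagifai import data

df = pd.read_csv("labeled_projects.csv")  # hypothetical path
df = data.preprocess(df, lower=True, stem=False, min_freq=75)
print(df.tag.value_counts())  # OOS and minority labels collapsed into "other"
```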
replace_minority_labels(df, label_col, min_freq, new_label='other')
Replace minority labels with another label.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `pd.DataFrame` | Pandas DataFrame with data. | *required* |
| `label_col` | `str` | name of the dataframe column that has the labels. | *required* |
| `min_freq` | `int` | minimum # of data points a label must have. | *required* |
| `new_label` | `str` | name of the new label to replace minority labels. | `'other'` |

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | Dataframe with replaced minority labels. |
Source code in tagifai/data.py
```python
def replace_minority_labels(
    df: pd.DataFrame, label_col: str, min_freq: int, new_label: str = "other"
) -> pd.DataFrame:
    """Replace minority labels with another label.

    Args:
        df (pd.DataFrame): Pandas DataFrame with data.
        label_col (str): name of the dataframe column that has the labels.
        min_freq (int): minimum # of data points a label must have.
        new_label (str, optional): name of the new label to replace minority labels. Defaults to "other".

    Returns:
        pd.DataFrame: Dataframe with replaced minority labels.
    """
    labels = Counter(df[label_col].values)
    labels_above_freq = Counter(label for label in labels.elements() if (labels[label] >= min_freq))
    df[label_col] = df[label_col].apply(lambda label: label if label in labels_above_freq else None)
    df[label_col] = df[label_col].fillna(new_label)
    return df
```
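A toy example of the behavior (the labels are made up):

```python
import pandas as pd

df = pd.DataFrame({"tag": ["cv", "cv", "cv", "nlp", "mlops"]})
df = replace_minority_labels(df, label_col="tag", min_freq=2)
print(df.tag.tolist())  # ['cv', 'cv', 'cv', 'other', 'other']
```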
replace_oos_labels(df, labels, label_col, oos_label='other')
Replace out of scope (OOS) labels.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `pd.DataFrame` | Pandas DataFrame with data. | *required* |
| `labels` | `List` | list of accepted labels. | *required* |
| `label_col` | `str` | name of the dataframe column that has the labels. | *required* |
| `oos_label` | `str` | name of the new label for OOS labels. | `'other'` |

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | Dataframe with replaced OOS labels. |
Source code in tagifai/data.py
```python
def replace_oos_labels(
    df: pd.DataFrame, labels: List, label_col: str, oos_label: str = "other"
) -> pd.DataFrame:
    """Replace out of scope (OOS) labels.

    Args:
        df (pd.DataFrame): Pandas DataFrame with data.
        labels (List): list of accepted labels.
        label_col (str): name of the dataframe column that has the labels.
        oos_label (str, optional): name of the new label for OOS labels. Defaults to "other".

    Returns:
        pd.DataFrame: Dataframe with replaced OOS labels.
    """
    oos_tags = [item for item in df[label_col].unique() if item not in labels]
    df[label_col] = df[label_col].apply(lambda x: oos_label if x in oos_tags else x)
    return df
```
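And the analogous toy example for out-of-scope labels (the accepted labels are made up):

```python
import pandas as pd

df = pd.DataFrame({"tag": ["cv", "nlp", "random-tag"]})
df = replace_oos_labels(df, labels=["cv", "nlp"], label_col="tag")
print(df.tag.tolist())  # ['cv', 'nlp', 'other']
```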