analysis

This notebook looks at outliers in the dataset in terms of `num_strokes` and `num_points`.

import os

import numpy as np

from singleline_dataset.dataset import *
from singleline_dataset.display import *
from singleline_dataset.fileorg import *
from singleline_dataset.strokes import *
from singleline_dataset.svg_files import *
from singleline_dataset.transforms import *

Full Runs

Dataset V1

epsilon=1.0
No max stroke threshold

## first dataset

# full_dataset = svgs_to_deltas('../../svg-dataset/sketch_mgmt/imgs_sorted/drawings_svg_cropped/', '../outputs', limit=None)
# np.savez('../datasets/v1-splice.npz', full_dataset, encoding='latin1', allow_pickle=True)

Dataset V2

epsilon=0.5
max_stroke={5,6}

# ## second dataset

# %%time

# full_dataset_eps05 = svgs_to_deltas(
#     "../../svg-dataset/sketch_mgmt/imgs_sorted/drawings_svg_cropped/",
#     "../outputs_segmented",
#     epsilon=0.5,
#     limit=None,
# )

# print("-" * 50)
# print(len(full_dataset_eps05))
# print("-" * 50)
# np.savez(
#     "../datasets/v2-splice-eps05-j15-s40.npz",
#     full_dataset_eps05,
#     encoding="latin1",
#     allow_pickle=True,
# )

Dataset V3

# full_dataset = svgs_to_deltas(
#     singleline_data_home() / "svg/epoch-20231214/0_drawings",
#     singleline_data_home() / "stroke3/epoch-20231214/0_drawings",
#     epsilon=1.0,
#     limit=None,
# )

# np.savez(
#     singleline_data_home() / "stroke3/epoch-20231214/full-v2-eps10.npz",
#     full_dataset,
#     encoding="latin1",
#     allow_pickle=True,
# )

# Resolve the dataset root directory, passing "../data_home" as the `default`.
# NOTE(review): presumably the default is used when the SINGLELINE_DATA_HOME
# environment variable is unset — confirm against singleline_data_home.
data_home = singleline_data_home(default="../data_home")

# Location of the sample .npz archive that is loaded for the analysis below.
sample_path = data_home / "stroke3/epoch-20231214/sample-v2-eps10.npz"

no env var SINGLELINE_DATA_HOME, defaulting to: None

Analysis: choosing a max-stroke threshold to filter out drawings that are too complex.

# Load the sample dataset from the .npz archive.
# `np.load` on an .npz file returns a lazy NpzFile that keeps the underlying
# file open, so the array is read inside a `with` block to guarantee the
# handle is closed once the data is in memory.
# NOTE(security): allow_pickle=True unpickles object arrays; only load
# archives that come from a trusted source.
with np.load(
    sample_path,
    encoding="latin1",
    allow_pickle=True,
) as npz_file:
    full_dataset = npz_file["arr_0"]

# Number of entries in the loaded dataset (shown as the cell output).
len(full_dataset)

# Build a summary table for the dataset; the recorded output shows the
# columns `idx`, `num_points` and `num_strokes`.
df = stroke_summary_df(full_dataset)
# Preview the first rows of the summary.
df.head()

	idx	num_points	num_strokes
0	0	246	5
1	1	172	4
2	2	306	4
3	3	213	3
4	4	35	4

# Histogram of the `num_strokes` column of the summary table.
df["num_strokes"].hist()

TODO: map the row order of the TSV/DataFrame to the order of the dataset (or of the full dataset file).

# Convert the first dataset entry from deltas to strokes a single time and
# reuse the result for plotting, instead of calling deltas_to_strokes twice
# on the same input.
# NOTE(review): assumes plot_strokes does not modify its argument — confirm.
s0 = deltas_to_strokes(full_dataset[0])

plot_strokes(s0)

Chop Dataset into Train/Val

# d6 = df[df.num_strokes <= 6].sample(frac=1)
# train_size = int(len(d6) * 0.8 / 100) * 100
# val_size = len(d6) - train_size
# print(train_size, val_size, len(d6))

# d6_train = full_dataset[list(d6[:train_size].idx)]
# d6_val = full_dataset[list(d6[train_size:].idx)]
# print(len(d6_train), len(d6_val))

# np.savez(
#     "v2-splice-maxstrokes6.npz",
#     train=d6_train,
#     valid=d6_val,
#     test=d6_val,
#     encoding="latin1",
#     allow_pickle=True,
# )

1200 347 1547
1200 347

(1200, 347)

# import pandas as pd

# d6_summary = [
#     {"idx": i, "num_points": len(deltas), "num_strokes": len(deltas_to_strokes(deltas))}
#     for i, deltas in enumerate(d6_train)
# ]
# d6df = pd.DataFrame(d6_summary)
# d6df.num_strokes.hist()

# d5 = df[df.num_strokes <= 5].sample(frac=1)
# train_size = int(len(d5) * 0.8 / 100) * 100
# val_size = len(d5) - train_size
# print(train_size, val_size, len(d5))

# d5_train = full_dataset[list(d5[:1200].idx)]
# d5_val = full_dataset[list(d5[1200:].idx)]
# print(len(d5_train), len(d5_val))

# np.savez(
#     "v2-splice-maxstrokes5.npz",
#     train=d5_train,
#     valid=d5_val,
#     test=d5_val,
#     encoding="latin1",
#     allow_pickle=True,
# )

1100 362 1462
1200 262

Inspect edge cases

# df[df.num_strokes <= 2].num_points.hist()

# df[df.num_strokes <= 2].iloc[:5]

	idx	num_points	num_strokes
1455	0	177	2
1456	5	252	2
1457	12	236	2
1458	28	387	2
1459	35	303	2

# df[df.num_strokes <= 2].iloc[:5]

# tmp1 = [s for s in deltas_to_strokes(tmp) if len(s) > 0]

# strokes_to_deltas(rdp_strokes(tmp1, epsilon=1.1)).shape

# plot_strokes(rdp_strokes(tmp1, epsilon=1.1))

# import numpy as np

# max_seq_len = 20
# overflow = len(tmp) - max_seq_len
# rand_offset = int(np.random.rand() * overflow)
# overflow, rand_offset
# rand_offset = 0
# plot_strokes(deltas_to_strokes(tmp[rand_offset : rand_offset + max_seq_len]))

# df[df.num_strokes < 5].sample(frac=1).num_strokes.hist()

# len(df[df.num_strokes == 6])

# for i in range(5):
#     row = df[df.num_strokes == 6].iloc[i]
#     print(row)
#     plot_strokes(deltas_to_strokes(full_dataset[row.idx]))

# for entry in by_num_strokes[10:20]:
#     idx, num_points, num_strokes = entry
#     plot_strokes(deltas_to_strokes(full_dataset[idx]))

Find shortest strokes

# len(df[df.num_strokes == 4])

# for idx in range(len(df[df.num_strokes <= 4])):
#     print(idx)