From 06395ced17ce43e8ad3824edf73b890dbe70e7de Mon Sep 17 00:00:00 2001 From: Sergio Jimenez Jimenez Date: Sat, 6 Dec 2025 21:24:12 +0100 Subject: [PATCH] better dataset --- prepare_dataset.ipynb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/prepare_dataset.ipynb b/prepare_dataset.ipynb index 9560a4b..e9a60ed 100644 --- a/prepare_dataset.ipynb +++ b/prepare_dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 91, + "execution_count": 108, "id": "93809ffc", "metadata": {}, "outputs": [ @@ -174,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 109, "id": "48724594", "metadata": {}, "outputs": [ @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 110, "id": "e1f793b6", "metadata": {}, "outputs": [], @@ -259,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 111, "id": "1652a78e", "metadata": {}, "outputs": [], @@ -295,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "id": "f523dd58", "metadata": {}, "outputs": [], @@ -341,7 +341,7 @@ " line_text += text\n", " \n", " line_text = line_text.strip()\n", - " line_text = re.sub(r\"[•▪◦●❖▶■]\", \"\", line_text)\n", + " line_text = re.sub(r\"[•▪◦●❖▶■\\uf000-\\uf0ff]\", \"\", line_text)\n", " \n", " if not line_text:\n", " continue\n", @@ -415,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 113, "id": "9f64a8c0", "metadata": {}, "outputs": [], @@ -428,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 114, "id": "41e4651d", "metadata": {}, "outputs": [],