better dataset

This commit is contained in:
2025-12-06 21:24:12 +01:00
parent 7503a23b4a
commit 06395ced17

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 108,
"id": "93809ffc",
"metadata": {},
"outputs": [
@@ -174,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 109,
"id": "48724594",
"metadata": {},
"outputs": [
@@ -239,7 +239,7 @@
},
{
"cell_type": "code",
"execution_count": 93,
"execution_count": 110,
"id": "e1f793b6",
"metadata": {},
"outputs": [],
@@ -259,7 +259,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": 111,
"id": "1652a78e",
"metadata": {},
"outputs": [],
@@ -295,7 +295,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 112,
"id": "f523dd58",
"metadata": {},
"outputs": [],
@@ -341,7 +341,7 @@
" line_text += text\n",
" \n",
" line_text = line_text.strip()\n",
" line_text = re.sub(r\"[•▪◦●❖▶■]\", \"\", line_text)\n",
" line_text = re.sub(r\"[•▪◦●❖▶■\\uf000-\\uf0ff]\", \"\", line_text)\n",
" \n",
" if not line_text:\n",
" continue\n",
@@ -415,7 +415,7 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 113,
"id": "9f64a8c0",
"metadata": {},
"outputs": [],
@@ -428,7 +428,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 114,
"id": "41e4651d",
"metadata": {},
"outputs": [],