better dataset

This commit is contained in:
2025-12-06 21:24:12 +01:00
parent 7503a23b4a
commit 06395ced17

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 91, "execution_count": 108,
"id": "93809ffc", "id": "93809ffc",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -174,7 +174,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 92, "execution_count": 109,
"id": "48724594", "id": "48724594",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -239,7 +239,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 93, "execution_count": 110,
"id": "e1f793b6", "id": "e1f793b6",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -259,7 +259,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 94, "execution_count": 111,
"id": "1652a78e", "id": "1652a78e",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -295,7 +295,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 112,
"id": "f523dd58", "id": "f523dd58",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -341,7 +341,7 @@
" line_text += text\n", " line_text += text\n",
" \n", " \n",
" line_text = line_text.strip()\n", " line_text = line_text.strip()\n",
" line_text = re.sub(r\"[•▪◦●❖▶■]\", \"\", line_text)\n", " line_text = re.sub(r\"[•▪◦●❖▶■\\uf000-\\uf0ff]\", \"\", line_text)\n",
" \n", " \n",
" if not line_text:\n", " if not line_text:\n",
" continue\n", " continue\n",
@@ -415,7 +415,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 96, "execution_count": 113,
"id": "9f64a8c0", "id": "9f64a8c0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -428,7 +428,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 97, "execution_count": 114,
"id": "41e4651d", "id": "41e4651d",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],