diff --git a/prepare_dataset.ipynb b/prepare_dataset.ipynb index 9560a4b..e9a60ed 100644 --- a/prepare_dataset.ipynb +++ b/prepare_dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 91, + "execution_count": 108, "id": "93809ffc", "metadata": {}, "outputs": [ @@ -174,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 109, "id": "48724594", "metadata": {}, "outputs": [ @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 110, "id": "e1f793b6", "metadata": {}, "outputs": [], @@ -259,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 111, "id": "1652a78e", "metadata": {}, "outputs": [], @@ -295,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "id": "f523dd58", "metadata": {}, "outputs": [], @@ -341,7 +341,7 @@ " line_text += text\n", " \n", " line_text = line_text.strip()\n", - " line_text = re.sub(r\"[•▪◦●❖▶■]\", \"\", line_text)\n", + " line_text = re.sub(r\"[•▪◦●❖▶■\\uf000-\\uf0ff]\", \"\", line_text)\n", " \n", " if not line_text:\n", " continue\n", @@ -415,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 113, "id": "9f64a8c0", "metadata": {}, "outputs": [], @@ -428,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 114, "id": "41e4651d", "metadata": {}, "outputs": [],