I have a form in which a user is entering input. Sometimes it will contain weird ASCII characters. Most likely the user is copying text from a Word document. For example: This is some text  Code (markup): How would I remove those extra characters from the string before taking the data and sticking it in a database (as an example)? Basically, after submitting the form I want to remove those characters and then continue processing the string.
You can validate against the ASCII code of each character. If it is not between X and Y, then reject it or str_replace it. For example: function isUpperCase($char){ if(ord($char)>=65 AND ord($char)<=90) return true; else return false; } PHP: You could do the same kind of range check to catch any character whose ASCII code is above the standard range, and so on. Get it?
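I prefer to use regular expressions like so: ereg_replace('[^A-z ]', '', $text); This will remove every character from $text that is not a letter or a space. (Note that the range A-z also covers the six punctuation characters that sit between Z and a in ASCII, so those are kept as well; use A-Za-z to match letters only.)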
Will this remove all the unwanted characters? I am using $to_replace = array("#9604;","#9632;","#9786;","#9787;", "#9829;", "#9830;", "#9827;", "#9824;","#8902;", "#9733;", "#9734;", "#9789;", "#9841;", "#9840;", "•", "#9688;", "#9675;", "#9673;", "#9678;", "#9689;", "#9794;", "#9792;", "#9834;", "#9835;", "#9788;", "#9758;", "#8597;", "#8252;", "¶", "§", "#9644;", "#8593;", "#8595;", "#8594;", "#8592;", "#8596;", "#8735;", "#9650;", "#9660;", "#9658;", "#9668;", "#9661;", "#9651;", "#9655;", "#9665;", "#9672;", "#9671;", "#9670;", "#9648;", "#9649;", "#8962;", "#8710;", "#8976;", "¬", "#9617;","¦", "#9618;", "#9619;", "#9635;", "#9636;", "#9637;", "#9638;", "#9639;", "#9640;", "#9641;", "#9474;", "#9508;", "#9569;", "#9570;", "#9558;", "#9557;", "#9571;", "#9553;", "#9559;", "#9565;", "#9564;", "#9563;", "#9488;", "#9492;", "#9524;", "#9516;", "#9500;", "#9472;", "#9532;", "#9566;", "#9567;", "#9562;", "#9556;", "#9577;", "#9574;", "#9568;", "#9552;", "#9580;", "#9575;", "#9576;", "#9572;", "#9573;", "#9561;", "#9560;", "#9554;", "#9555;", "#9579;", "#9578;", "#9496;", "#9484;", "#9608;", "#9612;", "#9616;", "#9600;", "#945;", "#915;", "#960;", "#931;", "#963;", "#964;", "#934;", "#920;", "#937;", "#948;", "#8734;", "#966;", "#949;", "#8745;", "#9696;", "#9697;", "#9581;", "#9582;", "#9583;", "#9584;", "#8801;", "#8805;", "#8804;", "#8992;", "#8993;", "#8776;", "#8729;", "#8730;", "#8319;", "#1758;", "#8362;", "¢", "£", "Â¥", "€", "#8355;", "#8359;", "ª", "º", "¿", "¬", "½", "#8531;", "#8532;", "¼", "¾", "#8539;", "#8540;", "#8541;", "#8542;", "¡", "µ", "±", "#8800;", "°", "#9450;", "²", "³", "¹", "¸", "â„¢", "¤", "‰", "#8453;", "#8470;", "†", "‡", "#9477;", "#9478;", "#9480;", "#9482;", "#9585;", "#9586;", "#9587;", "„", "…", "Æ", "Ã", "Â", "À", "Ã…", "Ã", "Ä", "#256;", "#258;", "#260;", "æ", "á", "â", "à ", "Ã¥", "ã", "ä", "#257;", "#259;", "#261;", "ß", "Ç", "#262;", "#264;", "#266;", "#268;", "ç", "#263;", "#265;", "#267;", "#269;", "#270;", "Ã", "#271;", "ð", 
"É", "Ê", "È", "Ë", "#274;", "#276;", "#278;", "#280;", "#282;", "é", "ê", "è", "ë", "#275;", "#277;", "#279;", "#281;", "#283;", "Æ’", "#284;", "#286;", "#288;", "#290;", "#285;", "#287;", "#289;", "#291;", "#292;", "#294;", "#293;", "#295;", "Ã", "ÃŽ", "ÃŒ", "Ã", "#296;", "#298;", "#300;", "#302;", "#304;", "Ã", "î", "ì", "ï", "#297;", "#299;", "#301;", "#303;", "#305;", "#306;", "#307;", "#308;", "#309;", "#310;", "#311;", "#312;", "#313;", "#315;", "#317;", "#319;", "#321;", "#314;", "#316;", "#318;", "#320;", "#322;", "Ñ", "#323;", "#325;", "#327;", "#330;", "ñ", "#324;", "#326;", "#328;", "#329;", "#331;", "Ó", "Ô", "Ã’", "Ø", "Õ", "Ö", "#332;", "#334;", "#336;", "#416;", "ó", "ô", "ò", "ø", "õ", "ö", "#333;", "#335;", "#337;", "#417;", "Þ", "þ", "#340;", "#342;", "#344;", "#341;", "#343;", "#345;", "#346;", "#348;", "#350;", "Å ", "#347;", "#349;", "#351;", "Å¡", "#354;", "#356;", "#358;", "#355;", "#357;", "#359;", "Ú", "Û", "Ù", "Ü", "#360;", "#362;", "#364;", "#366;", "#368;", "#370;", "#431;", "ú", "û", "ù", "ü", "#361;", "#363;", "#365;", "#367;", "#369;", "#371;", "#432;", "#372;", "#373;", "Ã", "Ÿ", "ý", "ÿ", "Ž", "#377;", "#379;", "ž", "#378;", "#380;", "Å’", "Å“", "#4347;", "#8467;", "#1108;", "#1103;", "#1080;", "#969;", "#1090;", "#1085;", "#965;", "#8494;", "#4304;", "#4305;", "#4306;", "#4307;", "#4308;", "#4309;", "#4310;", "#4311;", "#4312;", "#4313;", "#4314;", "#4315;", "#4316;", "#4317;", "#4318;", "#4319;", "#4320;", "#4321;", "#4322;", "#4323;", "#4324;", "#4325;", "#4326;", "#4327;", "#4328;", "#4329;", "#4330;", "#4331;", "#4332;", "#4333;", "#4334;", "#4335;", "#4336;", "#4337;", "#4338;", "#4339;", "#4340;", "#4341;", "#4342;", "#1329;", "#1330;", "#1331;", "#1332;", "#1333;", "#1334;", "#1335;", "#1336;", "#1337;", "#1338;", "#1339;", "#1340;", "#1341;", "#1342;", "#1343;", "#1344;", "#1345;", "#1346;", "#1347;", "#1348;", "#1349;", "#1350;", "#1351;", "#1352;", "#1353;", "#1354;", "#1355;", "#1356;", "#1357;", "#1358;", "#1359;", 
"#1360;", "#1361;", "#1362;", "#1363;", "#1364;", "#1365;", "#1366;", "#1377;", "#1378;", "#1379;", "#1380;", "#1381;", "#1382;", "#1383;", "#1384;", "#1385;", "#1386;", "#1387;", "#1388;", "#1389;", "#1390;", "#1391;", "#1392;", "#1393;", "#1394;", "#1395;", "#1396;", "#1397;", "#1398;", "#1399;", "#1400;", "#1401;", "#1402;", "#1403;", "#1404;", "#1405;", "#1406;", "#1407;", "#1408;", "#1409;", "#1410;", "#1411;", "#1412;", "#1413;", "#1414;", "#1415;", "#1488;", "#1489;", "#1490;", "#1491;", "#1492;", "#1493;", "#1494;", "#1495;", "#1496;", "#1497;", "#1498;", "#1499;", "#1500;", "#1501;", "#1502;", "#1503;", "#1504;", "#1505;", "#1506;", "#1507;", "#1508;", "#1509;", "#1510;", "#1511;", "#1512;", "#1513;", "#1514;", "#1570;", "#1571;", "#1572;", "#1573;", "#1574;", "#1575;", "#1576;", "#1577;", "#1578;", "#1579;", "#1580;", "#1581;", "#1582;", "#1583;", "#1584;", "#1585;", "#1586;", "#1587;", "#1588;", "#1589;", "#1590;", "#1591;", "#1592;", "#1593;", "#1594;", "#1601;", "#1602;", "#1603;", "#1604;", "#1605;", "#1606;", "#1607;", "#1608;", "#1609;", "#1610;","&#9604;","#9786;","#9787;", "&hearts;", "&diams;", "&clubs;", "&spades;", "&#8902;", "&#9733;", "&#9734;", "&#9789;", "#9829;", "&#9841;", "&#9840;", "&#0149;", "&#9688;", "&#9675;", "&#9673;", "&#9678;", "&#9689;", "&#9794;", "&#9792;", "&#9834;", "&#9835;", "&#9788;", "&#9758;", "&#8597;", "&#8252;", "&para;", "&sect;", "&#9644;", "&#8593;", "&#8595;", "&#8594;", "&#8592;", "&#8596;", "&#8735;", "&#9650;", "&#9660;", "&#9658;", "&#9668;", "&#9661;", "&#9651;", "&#9655;", "&#9665;", "&#9672;", "&#9671;", "&#9670;", "&#9648;", "&#9649;", "&#8962;", "&#8710;", "&#8976;", "&#9617;", "&#9618;", "&#9619;", "&#9635;", "&#9636;", "&#9637;", "&#9638;", "&#9639;", "&#9640;", "&#9641;", "&#9474;", "&#9508;", "&#9569;", "&#9570;", "&#9558;", "&#9557;", "&#9571;", "&#9553;", "&#9559;", "&#9565;", "&#9564;", "&#9563;", "&#9488;", "&#9492;", "&#9524;", "&#9516;", "&#9500;", "&#9472;", "&#9532;", "&#9566;", "&#9567;", 
"&#9562;", "&#9556;", "&#9577;", "&#9574;", "&#9568;", "&#9552;", "&#9580;", "&#9575;", "&#9576;", "&#9572;", "&#9573;", "&#9561;", "&#9560;", "&#9554;", "&#9555;", "&#9579;", "&#9578;", "&#9496;", "&#9484;", "&#9608;", "&#9612;", "&#9616;", "&#9600;", "&#945;", "&#915;", "&#960;", "&#931;", "&#963;", "&#964;", "&#934;", "&#920;", "&#937;", "&#948;", "&#8734;", "&#966;", "&#949;", "&#8745;", "&#9696;", "&#9697;", "&#9581;", "&#9582;", "&#9583;", "&#9584;", "&#8801;", "&#8805;", "&#8804;", "&#8992;", "&#8993;", "&#8776;", "&#8729;", "&#8730;", "&#8319;", "&#1758;", "&#8362;", "&cent;", "&pound;", "&yen;", "&euro;", "&#8355;", "&#8359;", "&ordf;", "&ordm;", "&iquest;", "&not;", "&frac12;", "&#8531;", "&#8532;", "&frac14;", "&frac34;", "&#8539;", "&#8540;", "&#8541;", "&#8542;", "&iexcl;", "&laquo;", "&raquo;", "&micro;", "&plusmn;", "&divide;", "&times;", "&ne;", "&deg;", "&middot;", "&#9450;", "&sup2;", "&sup3;", "&sup1;", "&acute;", "&cedil;", "&reg;", "&copy;", "&trade;", "&curren;", "&permil;", "&#8453;", "&#8470;", "&dagger;", "&Dagger;", "&uml;", "&lt;", "&gt;", "&amp;", "&brvbar;", "&#9477;", "&#9478;", "&#9480;", "&#9482;", "&#9585;", "&#9586;", "&#9587;", "&quot;", "&#130;", "&#132;", "&#133;", "&macr;", "&#150;", "&#151;", "&AElig;", "&Aacute;", "&Acirc;", "&Agrave;", "&Aring;", "&Atilde;", "&Auml;", "&#256;", "&#258;", "&#260;", "&aelig;", "&aacute;", "&acirc;", "&agrave;", "&aring;", "&atilde;", "&auml;", "&#257;", "&#259;", "&#261;", "&szlig;", "&Ccedil;", "&#262;", "&#264;", "&#266;", "&#268;", "&ccedil;", "&#263;", "&#265;", "&#267;", "&#269;", "&#270;", "&ETH;", "&#271;", "&eth;", "&Eacute;", "&Ecirc;", "&Egrave;", "&Euml;", "&#274;", "&#276;", "&#278;", "&#280;", "&#282;", "&eacute;", "&ecirc;", "&egrave;", "&euml;", "&#275;", "&#277;", "&#279;", "&#281;", "&#283;", "&#131;", "&#284;", "&#286;", "&#288;", "&#290;", "&#285;", "&#287;", "&#289;", "&#291;", "&#292;", "&#294;", "&#293;", "&#295;", "&Iacute;", "&Icirc;", "&Igrave;", "&Iuml;", "&#296;", 
"&#298;", "&#300;", "&#302;", "&#304;", "&iacute;", "&icirc;", "&igrave;", "&iuml;", "&#297;", "&#299;", "&#301;", "&#303;", "&#305;", "&#306;", "&#307;", "&#308;", "&#309;", "&#310;", "&#311;", "&#312;", "&#313;", "&#315;", "&#317;", "&#319;", "&#321;", "&#314;", "&#316;", "&#318;", "&#320;", "&#322;", "&Ntilde;", "&#323;", "&#325;", "&#327;", "&#330;", "&ntilde;", "&#324;", "&#326;", "&#328;", "&#329;", "&#331;", "&Oacute;", "&Ocirc;", "&Ograve;", "&Oslash;", "&Otilde;", "&Ouml;", "&#332;", "&#334;", "&#336;", "&#416;", "&oacute;", "&ocirc;", "&ograve;", "&oslash;", "&otilde;", "&ouml;", "&#333;", "&#335;", "&#337;", "&#417;", "&THORN;", "&thorn;", "&#340;", "&#342;", "&#344;", "&#341;", "&#343;", "&#345;", "&#346;", "&#348;", "&#350;", "&#352;", "&#347;", "&#349;", "&#351;", "&#353;", "&#354;", "&#356;", "&#358;", "&#355;", "&#357;", "&#359;", "&Uacute;", "&Ucirc;", "&Ugrave;", "&Uuml;", "&#360;", "&#362;", "&#364;", "&#366;", "&#368;", "&#370;", "&#431;", "&uacute;", "&ucirc;", "&ugrave;", "&uuml;", "&#361;", "&#363;", "&#365;", "&#367;", "&#369;", "&#371;", "&#432;", "&#372;", "&#373;", "&Yacute;", "&Yuml;", "&yacute;", "&yuml;", "&#142;", "&#377;", "&#379;", "&#158;", "&#378;", "&#380;", "&#140;", "&#156;", "&#4347;", "&#8467;", "&#1108;", "&#1103;", "&#1080;", "&#969;", "&#1090;", "&#1085;", "&#965;", "&#8494;", "&#4304;", "&#4305;", "&#4306;", "&#4307;", "&#4308;", "&#4309;", "&#4310;", "&#4311;", "&#4312;", "&#4313;", "&#4314;", "&#4315;", "&#4316;", "&#4317;", "&#4318;", "&#4319;", "&#4320;", "&#4321;", "&#4322;", "&#4323;", "&#4324;", "&#4325;", "&#4326;", "&#4327;", "&#4328;", "&#4329;", "&#4330;", "&#4331;", "&#4332;", "&#4333;", "&#4334;", "&#4335;", "&#4336;", "&#4337;", "&#4338;", "&#4339;", "&#4340;", "&#4341;", "&#4342;", "&#1329;", "&#1330;", "&#1331;", "&#1332;", "&#1333;", "&#1334;", "&#1335;", "&#1336;", "&#1337;", "&#1338;", "&#1339;", "&#1340;", "&#1341;", "&#1342;", "&#1343;", "&#1344;", "&#1345;", "&#1346;", "&#1347;", "&#1348;", 
"&#1349;", "&#1350;", "&#1351;", "&#1352;", "&#1353;", "&#1354;", "&#1355;", "&#1356;", "&#1357;", "&#1358;", "&#1359;", "&#1360;", "&#1361;", "&#1362;", "&#1363;", "&#1364;", "&#1365;", "&#1366;", "&#1377;", "&#1378;", "&#1379;", "&#1380;", "&#1381;", "&#1382;", "&#1383;", "&#1384;", "&#1385;", "&#1386;", "&#1387;", "&#1388;", "&#1389;", "&#1390;", "&#1391;", "&#1392;", "&#1393;", "&#1394;", "&#1395;", "&#1396;", "&#1397;", "&#1398;", "&#1399;", "&#1400;", "&#1401;", "&#1402;", "&#1403;", "&#1404;", "&#1405;", "&#1406;", "&#1407;", "&#1408;", "&#1409;", "&#1410;", "&#1411;", "&#1412;", "&#1413;", "&#1414;", "&#1415;", "&#1488;", "&#1489;", "&#1490;", "&#1491;", "&#1492;", "&#1493;", "&#1494;", "&#1495;", "&#1496;", "&#1497;", "&#1498;", "&#1499;", "&#1500;", "&#1501;", "&#1502;", "&#1503;", "&#1504;", "&#1505;", "&#1506;", "&#1507;", "&#1508;", "&#1509;", "&#1510;", "&#1511;", "&#1512;", "&#1513;", "&#1514;","ˆ","¨"); $title = str_replace($to_replace, " ", $title); PHP: I know my one is not the proper one. But it works
It will remove ANYTHING that isn't A-Z in either upper or lowercase OR a space. So, for example, $text = "934938484 hello how are you?"; will turn into $text = " hello how are you"; If you want to keep numbers too, simply change ^A-z to ^0-9A-z
You never said you wanted that. I haven't tested it, but you can probably put most of the punctuation in like so: ereg_replace('[^A-z0-9\.=() ]', '', $text); Code (markup): Notice I "escaped" the period with a backslash like so: \. You may have to do the same for =, ( and ), but I can't check at the moment, so you'll have to test it yourself.