using System.Text;

namespace UnicodeUtility
{
    /// <summary>
    /// Poorly developed by Christopher Harsch
    ///
    /// The mapping for this was created from
    /// https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lvg/current/docs/designDoc/UDF/unicode/DefaultTables/symbolTable.html
    ///
    /// </summary>
    class UnicodeToAsciiUtility
    {
        /// <summary>
        /// Takes a Unicode string, swaps a fixed set of punctuation and symbol characters
        /// for their ASCII equivalents, then hammers every remaining non-ASCII character
        /// into a '?' character.
        /// </summary>
        /// <param name="inputstring">String you want to hammer into ASCII. A null value is treated as empty.</param>
        /// <returns>A string made up only of ASCII characters.</returns>
        public static string unicodeToASCIIHammer(string inputstring)
        {
            if (string.IsNullOrEmpty(inputstring))
            {
                return string.Empty;
            }

            // Single pass over the input: every mapped character becomes an ASCII
            // character and no ASCII character is ever a mapping source, so this is
            // equivalent to applying the replacements one after another.
            StringBuilder mapped = new StringBuilder(inputstring.Length);
            foreach (char current in inputstring)
            {
                mapped.Append(ToAsciiEquivalent(current));
            }

            // The ASCII encoder substitutes '?' for anything it cannot represent;
            // decoding the bytes again yields the final ASCII-only string.
            byte[] asciiBytes = Encoding.ASCII.GetBytes(mapped.ToString());
            return Encoding.ASCII.GetString(asciiBytes);
        }

        /// <summary>
        /// Returns the ASCII stand-in for a mapped character, or the character itself
        /// when it has no entry in the table.
        /// </summary>
        /// <param name="value">Character to look up.</param>
        /// <returns>The mapped ASCII character, or <paramref name="value"/> unchanged.</returns>
        private static char ToAsciiEquivalent(char value)
        {
            // NOTE(review): U+27E6 maps to '[' and U+301B maps to ']', but their
            // partners U+27E7 and U+301A have no entry and therefore end up as '?'.
            // Confirm against the mapping table before adding them.
            switch (value)
            {
                case '\u00AB': return '"';
                case '\u00AD': return '-';
                case '\u00B4': return '\'';
                case '\u00BB': return '"';
                case '\u00F7': return '/';
                case '\u01C0': return '|';
                case '\u01C3': return '!';
                case '\u02B9': return '\'';
                case '\u02BA': return '"';
                case '\u02BC': return '\'';
                case '\u02C4': return '^';
                case '\u02C6': return '^';
                case '\u02C8': return '\'';
                case '\u02CB': return '`';
                case '\u02CD': return '_';
                case '\u02DC': return '~';
                case '\u0300': return '`';
                case '\u0301': return '\'';
                case '\u0302': return '^';
                case '\u0303': return '~';
                case '\u030B': return '"';
                case '\u030E': return '"';
                case '\u0331': return '_';
                case '\u0332': return '_';
                case '\u0338': return '/';
                case '\u0589': return ':';
                case '\u05C0': return '|';
                case '\u05C3': return ':';
                case '\u066A': return '%';
                case '\u066D': return '*';
                case '\u200B': return ' ';
                case '\u2010': return '-';
                case '\u2011': return '-';
                case '\u2012': return '-';
                case '\u2013': return '-';
                case '\u2014': return '-';
                case '\u2015': return '-';
                case '\u2016': return '|';
                case '\u2017': return '_';
                case '\u2018': return '\'';
                case '\u2019': return '\'';
                case '\u201A': return ',';
                case '\u201B': return '\'';
                case '\u201C': return '"';
                case '\u201D': return '"';
                case '\u201E': return '"';
                case '\u201F': return '"';
                case '\u2032': return '\'';
                case '\u2033': return '"';
                case '\u2034': return '\'';
                case '\u2035': return '`';
                case '\u2036': return '"';
                case '\u2037': return '\'';
                case '\u2038': return '^';
                case '\u2039': return '<';
                case '\u203A': return '>';
                case '\u203D': return '?';
                case '\u2044': return '/';
                case '\u204E': return '*';
                case '\u2052': return '%';
                case '\u2053': return '~';
                case '\u2060': return ' ';
                case '\u20E5': return '\\';
                case '\u2212': return '-';
                case '\u2215': return '/';
                case '\u2216': return '\\';
                case '\u2217': return '*';
                case '\u2223': return '|';
                case '\u2236': return ':';
                case '\u223C': return '~';
                case '\u2264': return '<';
                case '\u2265': return '>';
                case '\u2266': return '<';
                case '\u2267': return '>';
                case '\u2303': return '^';
                case '\u2329': return '<';
                case '\u232A': return '>';
                case '\u266F': return '#';
                case '\u2731': return '*';
                case '\u2758': return '|';
                case '\u2762': return '!';
                case '\u27E6': return '[';
                case '\u27E8': return '<';
                case '\u27E9': return '>';
                case '\u2983': return '{';
                case '\u2984': return '}';
                case '\u3003': return '"';
                case '\u3008': return '<';
                case '\u3009': return '>';
                case '\u301B': return ']';
                case '\u301C': return '~';
                case '\u301D': return '"';
                case '\u301E': return '"';
                case '\uFEFF': return ' ';
                default: return value;
            }
        }
    }
}
Saturday, December 7, 2019
Unicode to ASCII convert with replacement
This should be a relatively trivial problem: take a Unicode string and change it to a functional equivalent in ASCII. For some reason I was not able to find a very good example of this anywhere, so here is my C# implementation.
Subscribe to:
Post Comments (Atom)
Colloquially called the Unicode Hammer
ReplyDelete