python - Get the row index of each extracted character from csv file -
I have a column (the second column, called second_column) in a CSV file that represents a list of characters and their positions, as follows: the column is called character_position.
Each line of the column contains a list of character positions. Overall, I have 300 lines in the column, each holding a list of character positions.
character_position = [['1', 1890, 1904, 486, 505, '8', 1905, 1916, 486, 507, '4', 1919, 1931, 486, 505, '1', 1935, 1947, 486, 505, '7', 1950, 1962, 486, 505, '2', 1965, 1976, 486, 505, '9', 1980, 1992, 486, 507, '6', 1995, 2007, 486, 505, '/', 2010, 2022, 484, 508, '4', 2025, 2037, 486, 505, '8', 2040, 2052, 486, 505, '3', 2057, 2067, 486, 507, '3', 2072, 2082, 486, 505, '0', 2085, 2097, 486, 507, '/', 2100, 2112, 484, 508, 'q', 2115, 2127, 486, 507, '1', 2132, 2144, 486, 505, '7', 2147, 2157, 486, 505, '9', 2162, 2174, 486, 505, '/', 2175, 2189, 484, 508, 'c', 2190, 2204, 487, 505, '4', 2207, 2219, 486, 505, '1', 2241, 2253, 486, 505, '/', 2255, 2268, 484, 508, '1', 2271, 2285, 486, 507, '5', 2288, 2297, 486, 505], ['d', 2118, 2132, 519, 535, '.', 2138, 2144, 529, 534, '2', 2150, 2162, 516, 535, '0', 2165, 2177, 516, 535, '4', 2180, 2192, 516, 534, '7', 2196, 2208, 516, 534, '0', 2210, 2223, 514, 535, '1', 2226, 2238, 516, 534, '8', 2241, 2253, 514, 534, '2', 2256, 2267, 514, 535, '4', 2270, 2282, 516, 534, '0', 2285, 2298, 514, 535]] each character has values : left, top, right, bottom. instance character '1' has left=1890, top=1904, right=486, bottom=505.
I read the whole CSV file as follows:
df = pd.read_csv(filepath_or_buffer='list_characters.csv', header=None, usecols=[1], names=['character_position']) From this file I created a new CSV file with 5 columns:
column 1: character, column 2 : left , column 3 : top, column 4 : right, column 5 : bottom. cols = ['char','left','top','right','bottom'] df1 = df.character_position.str.strip('[]').str.split(', ', expand=true) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) df1.columns = cols df1[cols[1:]] = df1[cols[1:]].astype(int) print (df1) char left top right bottom 0 'm' 38 104 2456 2492 1 'i' 40 102 2442 222 2 '.' 203 213 191 198 3 '3' 235 262 131 3333 4 'a' 275 347 147 239 5 'm' 363 465 145 3334 6 'a' 73 91 373 394 7 'd' 93 112 373 39 8 'd' 454 473 663 685 9 'o' 474 495 664 33 10 'a' 108 129 727 751 11 'v' 129 150 727 444 l want add 2 other column called line_number , all_chars_in_same_row 1)line_number corresponds line example 'm' 38 104 2456 2492 extracted let line 2 2) all_chars_in_same_row corresponds (spaced) characters in same row. instance
character_position = [['1', 1890, 1904, 486, 505, '8', 1905, 1916, 486, 507, '4', 1919, 1931, 486, 505, '1', 1935, 1947, 486, 505, '7', 1950, 1962, 486, 505, '2', 1965, 1976, 486, 505, '9', 1980, 1992, 486, 507, '6', 1995, 2007, 486, 505, '/', 2010, 2022, 484, 508, '4', 2025, 2037, 486, 505, '8', 2040, 2052, 486, 505, '3', 2057, 2067, 486, 507, '3', 2072, 2082, 486, 505, '0', 2085, 2097, 486, 507, '/', 2100, 2112, 484, 508, 'q', 2115, 2127, 486, 507, '1', 2132, 2144, 486, 505, '7', 2147, 2157, 486, 505, '9', 2162, 2174, 486, 505, '/', 2175, 2189, 484, 508, 'c', 2190, 2204, 487, 505, '4', 2207, 2219, 486, 505, '1', 2241, 2253, 486, 505, '/', 2255, 2268, 484, 508, '1', 2271, 2285, 486, 507, '5', 2288, 2297, 486, 505], ['d', 2118, 2132, 519, 535, '.', 2138, 2144, 529, 534, '2', 2150, 2162, 516, 535, '0', 2165, 2177, 516, 535, '4', 2180, 2192, 516, 534, '7', 2196, 2208, 516, 534, '0', 2210, 2223, 514, 535, '1', 2226, 2238, 516, 534, '8', 2241, 2253, 514, 534, '2', 2256, 2267, 514, 535, '4', 2270, 2282, 516, 534, '0', 2285, 2298, 514, 535]] l '1' '8' '4' '1' '7' , on.
More formally, all_chars_in_same_row means: write all the characters of the row given in the line_number column.
char left top right bottom line_number all_chars_in_same_row 0 'm' 38 104 2456 2492 line 2 'm' '2' '5' 'g' 1 'i' 40 102 2442 222 line 4 2 '.' 203 213 191 198 line 6 3 '3' 235 262 131 3333 4 'a' 275 347 147 239 5 'm' 363 465 145 3334 6 'a' 73 91 373 394 7 'd' 93 112 373 39 8 'd' 454 473 663 685 9 'o' 474 495 664 33 10 'a' 108 129 727 751 11 'v' 129 150 727 444 edit1:
import pandas as pd df_data = pd.read_csv('/home/ahmed/internship/cnn_ocr/list_characters.csv') df_data.shape (50, 3)
df_data.icol(1) 0 [['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442... 1 [['.', 203, 213, 191, 198, '3', 235, 262, 131,... 2 [['a', 275, 347, 147, 239, 'm', 363, 465, 145,... 3 [['a', 73, 91, 373, 394, 'd', 93, 112, 373, 39... 4 [['d', 454, 473, 663, 685, 'o', 474, 495, 664,... 5 [['a', 108, 129, 727, 751, 'v', 129, 150, 727,... 6 [['n', 34, 51, 949, 970, '/', 52, 61, 948, 970... 7 [['s', 1368, 1401, 43, 85, 'a', 1406, 1446, 43... 8 [['s', 1437, 1457, 112, 138, 'o', 1458, 1476, ... 9 [['h', 1686, 1703, 315, 339, 't', 1706, 1715, ... 10 [['n', 1331, 1349, 370, 391, 'c', 1361, 1379, ... 11 [['n', 1758, 1775, 370, 391, 'd', 1785, 1803, ... 12 [['d', 2166, 2184, 370, 391, 'a', 2186, 2205, ... 13 [['2', 1395, 1415, 427, 454, '0', 1416, 1434, ... 14 [['i', 1533, 1545, 487, 541, 'i', 1548, 1551, ... 15 [['p', 1659, 1677, 490, 514, '2', 1680, 1697, ... 16 [['1', 1890, 1904, 486, 505, '8', 1905, 1916, ... 17 [['b', 1344, 1361, 583, 607, 'o', 1364, 1386, ... 18 [['b', 1548, 1580, 979, 1015, 't', 1586, 1619,... 19 [['q', 169, 190, 1291, 1312, 'u', 192, 210, 12... 20 [['1', 296, 305, 1492, 1516, 's', 339, 357, 14... 21 [['g', 339, 362, 1815, 1840, 's', 365, 384, 18... 22 [['2', 1440, 1455, 2047, 2073, '9', 1458, 1475... 23 [['r', 339, 360, 2137, 2163, 'e', 363, 378, 21... 24 [['r', 339, 360, 1860, 1885, 'e', 363, 380, 18... 25 [['0', 1266, 1283, 1951, 1977, ',', 1287, 1290... 26 [['1', 2207, 2217, 1492, 1515, '0', 2225, 2240... 27 [['1', 2364, 2382, 1552, 1585], [], ['e', 2369... 28 [['s', 2369, 2382, 1833, 1866]] 29 [['0', 2243, 2259, 1951, 1977, '0', 2271, 2288... 30 [['0', 2243, 2259, 2227, 2253, '0', 2271, 2286... 31 [['d', 76, 88, 2580, 2596, 'é', 91, 100, 2580,... 32 [['ü', 1474, 1489, 2586, 2616, '3', 1541, 1557... 33 [['e', 1440, 1461, 2670, 2697, 'u', 1466, 1488... 34 [['2', 1685, 1703, 2670, 2697, '.', 1707, 1712... 35 [['1', 2202, 2213, 2668, 2695, '3', 2220, 2237... 36 [['c', 88, 118, 2872, 2902]] 37 [['n', 127, 144, 2889, 2910, 'd', 156, 175, 28... 
38 [['e', 108, 129, 3144, 3172, 'c', 133, 156, 31... 39 [['5', 108, 126, 3204, 3231, '0', 129, 147, 32... 40 [[]] 41 [['1', 480, 492, 3202, 3229, '6', 500, 518, 32... 42 [['p', 217, 234, 3337, 3360, 'a', 235, 255, 33... 43 [[]] 44 [['i', 954, 963, 2892, 2934, 'm', 969, 1011, 2... 45 [['e', 1385, 1407, 2970, 2998, 'u', 1410, 1433... 46 [['t', 2067, 2084, 2889, 2911, 'o', 2088, 2106... 47 [['1', 2201, 2213, 2970, 2997, '6', 2219, 2238... 48 [['m', 1734, 1755, 3246, 3267, 'o', 1758, 1779... 49 [['l', 923, 935, 3411, 3430, 'a', 941, 957, 34... name: character_position, dtype: object then in char.csv l following
df = pd.read_csv('list_characters.csv', header=none, usecols=[1], names=['character_position']) df = df.replace(['\[','\]'], ['',''], regex=true) cols = ['char','left','right','top','bottom'] df1 = df.positionlrtb.str.strip('[]').str.split(',', expand=true) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) df1.columns = cols df1.char = df1.char.replace(['\[','\]'], ['',''], regex=true) df1['left']=df1['left'].replace(['\[','\]'], ['',''], regex=true) df1['top']=df1['top'].replace(['\[','\]'], ['',''], regex=true) df1['right']=df1['right'].replace(['\[','\]'], ['',''], regex=true) df1['bottom']=df1['bottom'].replace(['\[','\]'], ['',''], regex=true) df1.to_csv('chars.csv') however l don't see in response how added columns from_line , all_char_in_same_rows.
When I execute this line of code:
df_data = df_data.character_position.str.strip('[]').str.split(',', expand=True) I get the following:
df_data[0:10] 0 1 2 3 4 5 6 7 8 9 ... \ 0 'm' 38 104 2456 2492 'i' 40 102 2442 2448 ... 1 '.' 203 213 191 198 '3' 235 262 131 198 ... 2 'a' 275 347 147 239 'm' 363 465 145 239 ... 3 'a' 73 91 373 394 'd' 93 112 373 396 ... 4 'd' 454 473 663 685 'o' 474 495 664 687 ... 5 'a' 108 129 727 751 'v' 129 150 727 753 ... 6 'n' 34 51 949 970 '/' 52 61 948 970 ... 7 's' 1368 1401 43 85 'a' 1406 1446 43 85 ... 8 's' 1437 1457 112 138 'o' 1458 1476 118 138 ... 9 'h' 1686 1703 315 339 't' 1706 1715 316 339 ... 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 0 none none none none none none none none none none 1 none none none none none none none none none none 2 none none none none none none none none none none 3 none none none none none none none none none none 4 none none none none none none none none none none 5 none none none none none none none none none none 6 none none none none none none none none none none here 10 first lines of csv file :
character_position 0 [['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442, 2448, 'i', 40, 100, 2402, 2410, 'l', 40, 102, 2372, 2382, 'm', 40, 102, 2312, 2358, 'u', 40, 102, 2292, 2310, 'i', 40, 104, 2210, 2260, 'l', 40, 104, 2180, 2208, 'i', 40, 104, 2140, 2166, 'l', 40, 104, 2124, 2134]] 1 [['.', 203, 213, 191, 198, '3', 235, 262, 131, 198]] 2 [['a', 275, 347, 147, 239, 'm', 363, 465, 145, 239, 's', 485, 549, 145, 243, 'u', 569, 631, 145, 241, 'n', 657, 733, 145, 239]] 3 [['a', 73, 91, 373, 394, 'd', 93, 112, 373, 396, 'r', 115, 133, 373, 396, 'e', 136, 153, 373, 396, 's', 156, 172, 373, 396, 's', 175, 192, 373, 396, 'e', 195, 211, 373, 396, 'd', 222, 241, 373, 396, 'e', 244, 261, 373, 396, 'l', 272, 285, 375, 396, 'i', 288, 293, 375, 396, 'v', 296, 314, 375, 396, 'r', 317, 334, 373, 396, 'a', 334, 354, 375, 396, 'i', 357, 360, 373, 396, 's', 365, 381, 373, 396, 'o', 384, 405, 373, 396, 'n', 408, 425, 373, 394]] 4 [['d', 454, 473, 663, 685, 'o', 474, 495, 664, 687, 'c', 498, 516, 664, 687, 'u', 519, 536, 663, 687, 'm', 540, 561, 663, 687, 'e', 564, 581, 663, 685, 'n', 584, 600, 664, 685, 't', 603, 618, 663, 685]] 5 [['a', 108, 129, 727, 751, 'v', 129, 150, 727, 753, 'o', 153, 175, 727, 753, 'i', 178, 183, 727, 751, 'r', 187, 210, 727, 751, 's', 220, 240, 727, 753, 'u', 243, 263, 727, 753, 'r', 267, 288, 727, 751, 'f', 302, 318, 727, 751, 'a', 320, 341, 727, 751, 'c', 342, 363, 726, 751, 't', 366, 384, 726, 750, 'u', 387, 407, 727, 751, 'r', 411, 432, 727, 751, 'e', 435, 453, 726, 751, 'p', 797, 815, 727, 751, 'a', 818, 839, 727, 751, 'g', 840, 863, 727, 751, 'e', 867, 885, 726, 751, '1', 900, 911, 727, 751, '1', 926, 934, 727, 751, '1', 947, 956, 727, 751, '5', 962, 979, 727, 751], ['r', 120, 142, 778, 807, 't', 144, 165, 778, 805, 't', 178, 199, 778, 805, 'e', 201, 219, 786, 807, 'c', 222, 240, 786, 807, 'h', 241, 258, 778, 807, 'n', 263, 279, 786, 807, 'i', 284, 287, 778, 805, 'c', 291, 308, 786, 807, 'a', 309, 327, 786, 807, 'r', 350, 374, 778, 807, 'e', 377, 395, 
786, 807, 't', 396, 405, 780, 805, 'u', 408, 425, 786, 807, 'r', 429, 440, 786, 807, 'n', 441, 458, 786, 807, '-', 471, 482, 793, 798, 'd', 497, 518, 778, 807, 'o', 522, 548, 777, 807, 'a', 549, 573, 778, 807, '/', 585, 596, 778, 807, 'd', 606, 630, 778, 807, 'a', 632, 656, 778, 807, 'p', 659, 680, 778, 805]] 6 [['n', 34, 51, 949, 970, '/', 52, 61, 948, 970, 'c', 63, 81, 948, 970, 'o', 84, 103, 948, 970, 'm', 106, 127, 949, 970, 'm', 130, 151, 948, 970, 'a', 153, 172, 949, 970, 'n', 175, 192, 949, 970, 'd', 195, 213, 948, 970, 'e', 217, 232, 948, 970], ['1', 73, 84, 993, 1020, '1', 94, 105, 993, 1020, '8', 112, 130, 991, 1020, '4', 135, 153, 993, 1018, '5', 156, 172, 994, 1018, '7', 175, 192, 993, 1018, '6', 195, 213, 993, 1020, '0', 216, 235, 991, 1020, '6', 238, 257, 993, 1020, '5', 260, 278, 993, 1020, '0', 407, 425, 991, 1020, '9', 428, 446, 991, 1020, '.', 450, 455, 1015, 1020, '0', 459, 477, 991, 1020, '1', 485, 494, 994, 1018, '.', 503, 507, 1015, 1020, '2', 512, 530, 991, 1020, '0', 533, 551, 991, 1020, '1', 555, 566, 993, 1020, '5', 575, 593, 993, 1020, 'r', 632, 656, 991, 1020, 'm', 659, 684, 991, 1020, 'a', 689, 713, 991, 1020, 'n', 726, 747, 993, 1020, 'o', 752, 770, 999, 1020, '.', 774, 779, 1015, 1020, '5', 794, 812, 993, 1020, '8', 815, 833, 991, 1020, '4', 834, 852, 993, 1017, '4', 857, 873, 994, 1018, '3', 878, 896, 991, 1020, '8', 899, 917, 991, 1020, '0', 920, 938, 991, 1020, '/', 950, 960, 991, 1020, '0', 971, 990, 993, 1020, '7', 995, 1011, 993, 1018, '1', 1016, 1026, 993, 1018, '6', 1034, 1052, 993, 1020, '7', 1055, 1073, 993, 1020, '4', 1076, 1094, 993, 1018, '8', 1098, 1116, 991, 1020, '9', 1119, 1137, 991, 1020, '0', 1140, 1158, 993, 1020, '9', 1160, 1178, 991, 1020], ['n', 34, 51, 1045, 1066, '/', 54, 61, 1045, 1066, 'b', 63, 79, 1044, 1066, 'o', 82, 102, 1044, 1066, 'n', 105, 121, 1045, 1066, 'd', 133, 151, 1045, 1066, 'e', 156, 172, 1044, 1066, 'l', 183, 196, 1045, 1066, 'i', 199, 204, 1045, 1066, 'v', 205, 223, 1045, 1066, 'r', 226, 
244, 1045, 1066, 'a', 246, 266, 1045, 1066, 'i', 267, 272, 1045, 1066, 's', 275, 291, 1044, 1066, 'o', 294, 314, 1045, 1066, 'n', 318, 335, 1045, 1066], ['8', 72, 90, 1093, 1122, '2', 93, 109, 1093, 1122, '5', 114, 132, 1095, 1122, '9', 135, 153, 1093, 1122, '7', 154, 172, 1095, 1122, '1', 178, 189, 1093, 1122, '3', 196, 214, 1093, 1122, '1', 220, 231, 1095, 1122, '0', 238, 257, 1093, 1122, '3', 260, 278, 1093, 1122, '0', 407, 425, 1093, 1122, '6', 429, 447, 1095, 1122, '.', 452, 455, 1117, 1122, '0', 459, 477, 1093, 1122, '2', 480, 498, 1093, 1122, '.', 503, 507, 1117, 1122, '2', 512, 530, 1093, 1122, '0', 533, 551, 1093, 1122, '1', 557, 567, 1095, 1122, '5', 575, 593, 1095, 1122], ['v', 70, 90, 1150, 1171, '/', 88, 97, 1150, 1171, 'r', 100, 118, 1150, 1171, 'é', 121, 136, 1144, 1173, 'f', 141, 156, 1150, 1171, 'ê', 159, 174, 1144, 1173, 'r', 177, 195, 1150, 1173, 'e', 198, 214, 1150, 1171, 'n', 217, 234, 1150, 1171, 'c', 238, 257, 1149, 1171, 'e', 260, 276, 1149, 1173, 'b', 476, 497, 1152, 1179, 'o', 501, 527, 1149, 1179, 'g', 530, 555, 1150, 1180, 'd', 560, 582, 1152, 1179, 'o', 585, 611, 1149, 1179, 'a', 614, 638, 1150, 1179, '1', 642, 653, 1152, 1179, '5', 659, 677, 1153, 1180, 'b', 681, 701, 1152, 1179, 't', 705, 726, 1152, 1179, '0', 728, 746, 1152, 1179, '6', 749, 767, 1152, 1179]] 7 [['s', 1368, 1401, 43, 85, 'a', 1406, 1446, 43, 85, 'm', 1451, 1491, 36, 85, 's', 1500, 1533, 43, 85, 'u', 1539, 1574, 43, 85, 'n', 1581, 1616, 43, 85, 'g', 1623, 1662, 42, 85, 'e', 1686, 1719, 43, 85, 'l', 1725, 1755, 43, 85, 'e', 1763, 1794, 42, 85, 'c', 1800, 1836, 43, 85, 't', 1841, 1874, 42, 85, 'r', 1880, 1914, 42, 84, 'o', 1919, 1959, 42, 85, 'n', 1965, 1998, 42, 84, 'i', 2007, 2016, 42, 84, 'c', 2022, 2058, 42, 84, 's', 2066, 2099, 42, 84, 'f', 2121, 2151, 42, 84, 'r', 2159, 2193, 42, 84, 'a', 2198, 2237, 40, 84, 'n', 2243, 2277, 40, 84, 'c', 2285, 2321, 42, 84, 'e', 2328, 2360, 40, 84]] 8 [['s', 1437, 1457, 112, 138, 'o', 1458, 1476, 118, 138, 'c', 1479, 1493, 120, 
138, 'i', 1494, 1499, 112, 136, 'é', 1503, 1518, 114, 138, 't', 1520, 1527, 115, 138, 'é', 1530, 1547, 112, 138, 'p', 1559, 1575, 120, 144, 'a', 1577, 1593, 118, 138, 'r', 1596, 1607, 118, 136, 'a', 1616, 1637, 112, 136, 'c', 1640, 1653, 118, 138, 't', 1655, 1664, 115, 136, 'i', 1665, 1670, 112, 136, 'o', 1673, 1688, 118, 138, 'n', 1692, 1707, 118, 136, 's', 1710, 1725, 118, 138, 's', 1736, 1755, 112, 138, 'i', 1760, 1763, 112, 136, 'm', 1767, 1791, 118, 136, 'p', 1794, 1811, 118, 142, 'l', 1812, 1817, 112, 136, 'i', 1821, 1824, 112, 136, 'f', 1827, 1835, 112, 136, 'i', 1835, 1841, 112, 136, 'é', 1845, 1860, 112, 136, 'e', 1863, 1878, 118, 136, 'a', 1890, 1907, 118, 138, 'u', 1910, 1925, 118, 136, 'c', 1937, 1958, 112, 136, 'a', 1961, 1977, 118, 136, 'p', 1980, 1995, 118, 142, 'i', 1998, 2003, 112, 136, 't', 2006, 2013, 114, 136, 'a', 2015, 2030, 118, 136, 'l', 2034, 2037, 112, 136, 'd', 2051, 2066, 111, 136, 'e', 2069, 2085, 117, 136, '2', 2097, 2112, 112, 136, '7', 2115, 2132, 111, 136, '.', 2136, 2139, 132, 136, '0', 2144, 2159, 111, 136, '0', 2162, 2178, 111, 136, '0', 2180, 2196, 111, 136, '.', 2201, 2205, 132, 135, '0', 2208, 2225, 111, 136, '0', 2228, 2243, 111, 136, '0', 2246, 2261, 111, 136, 't', 2273, 2281, 112, 135, 'i', 2281, 2291, 111, 136], ['1', 1473, 1482, 153, 177, ',', 1491, 1494, 172, 181, 'r', 1508, 1517, 159, 177, 'u', 1520, 1535, 160, 177, 'e', 1538, 1554, 159, 177, 'f', 1566, 1583, 153, 177, 'r', 1587, 1596, 159, 177, 'u', 1598, 1613, 159, 177, 'c', 1617, 1631, 159, 177, 't', 1634, 1641, 154, 177, 'i', 1643, 1646, 153, 177, 'd', 1650, 1665, 151, 177, 'o', 1668, 1685, 159, 177, 'r', 1688, 1697, 159, 177, 'c', 1709, 1730, 153, 177, 's', 1733, 1751, 153, 177, '2', 1764, 1779, 153, 177, '0', 1781, 1797, 153, 177, '0', 1800, 1817, 153, 177, '3', 1820, 1835, 151, 177, '9', 1847, 1863, 151, 177, '3', 1866, 1883, 151, 177, '4', 1883, 1901, 153, 175, '8', 1904, 1919, 151, 177, '4', 1919, 1937, 153, 175, 's', 1950, 1968, 151, 177, 'a', 1971, 1992, 151, 
175, 'i', 1995, 2000, 151, 175, 'n', 2004, 2024, 151, 175, 't', 2027, 2046, 151, 175, 'o', 2058, 2081, 151, 177, 'u', 2085, 2105, 151, 177, 'e', 2109, 2127, 151, 177, 'n', 2130, 2150, 151, 175, 'c', 2163, 2186, 151, 175, 'e', 2187, 2204, 157, 175, 'd', 2207, 2222, 150, 175, 'e', 2225, 2240, 157, 175, 'x', 2243, 2258, 157, 175], ['t', 1638, 1656, 192, 216, 'É', 1659, 1677, 186, 217, 'l', 1682, 1697, 193, 217, 'É', 1701, 1719, 187, 217, 'p', 1722, 1742, 192, 217, 'h', 1746, 1766, 193, 217, 'o', 1770, 1793, 192, 217, 'n', 1796, 1815, 192, 216, 'e', 1820, 1838, 192, 217, '0', 1869, 1886, 190, 216, '1', 1890, 1899, 192, 216, '4', 1914, 1931, 193, 216, '4', 1934, 1950, 193, 216, '0', 1961, 1977, 190, 216, '4', 1980, 1997, 193, 216, '7', 2009, 2024, 192, 216, '0', 2027, 2042, 192, 216, '0', 2055, 2070, 192, 216, '0', 2073, 2090, 192, 216], ['r', 1517, 1538, 232, 258, '.', 1542, 1545, 253, 256, 'c', 1550, 1571, 232, 256, '.', 1575, 1580, 252, 256, 's', 1584, 1602, 232, 256, '.', 1607, 1611, 252, 256, 'b', 1625, 1643, 232, 256, 'o', 1649, 1670, 231, 258, 'b', 1674, 1692, 232, 256, 'i', 1697, 1701, 232, 256, 'g', 1706, 1728, 232, 256, 'n', 1731, 1751, 232, 256, 'y', 1754, 1775, 232, 256, 'b', 1788, 1806, 232, 256, '3', 1818, 1835, 231, 256, '3', 1838, 1855, 231, 256, '4', 1855, 1872, 232, 255, '3', 1884, 1899, 232, 256, '6', 1904, 1919, 232, 256, '7', 1922, 1937, 232, 256, '4', 1947, 1964, 232, 256, '9', 1967, 1983, 232, 256, '7', 1986, 2001, 232, 256, '-', 2013, 2022, 244, 249, 'a', 2034, 2055, 231, 255, 'p', 2057, 2075, 231, 255, 'e', 2079, 2097, 231, 256, '4', 2109, 2126, 232, 255, '6', 2129, 2145, 232, 256, '5', 2148, 2163, 232, 256, '2', 2166, 2183, 232, 255, 'z', 2193, 2211, 231, 255], ['c', 1628, 1647, 271, 297, 'o', 1652, 1670, 279, 297, 'd', 1671, 1689, 273, 297, 'e', 1692, 1709, 279, 298, 't', 1721, 1739, 273, 297, 'v', 1742, 1763, 273, 297, 'a', 1763, 1787, 273, 297, 'f', 1818, 1835, 273, 297, 'r', 1839, 1859, 273, 297, '8', 1872, 1889, 273, 297, '9', 1890, 1905, 
273, 297, '3', 1919, 1932, 273, 297, '3', 1937, 1952, 273, 297, '4', 1953, 1971, 273, 297, '3', 1983, 1998, 273, 297, '6', 2001, 2018, 273, 297, '7', 2021, 2036, 273, 295, '4', 2048, 2064, 274, 297, '9', 2066, 2082, 273, 297, '7', 2085, 2100, 273, 295]] 9 [['h', 1686, 1703, 315, 339, 't', 1706, 1715, 316, 339, 't', 1718, 1727, 316, 339, 'p', 1730, 1748, 321, 345, 'i', 1751, 1757, 321, 339, 'f', 1760, 1769, 315, 339, '/', 1769, 1776, 313, 339, 'w', 1779, 1804, 321, 337, 'w', 1804, 1829, 321, 339, 'w', 1830, 1854, 321, 337, '.', 1859, 1863, 333, 337, 's', 1868, 1883, 319, 339, 'a', 1886, 1901, 321, 337, 'm', 1905, 1929, 321, 337, 's', 1932, 1949, 321, 339, 'u', 1953, 1968, 321, 339, 'n', 1973, 1989, 321, 339, 'g', 1992, 2010, 319, 345, '.', 2015, 2019, 333, 337, 'f', 2021, 2033, 313, 337, 'r', 2034, 2045, 319, 337]] 10 [['n', 1331, 1349, 370, 391, 'c', 1361, 1379, 370, 393, 'o', 1382, 1403, 370, 393, 'm', 1404, 1425, 370, 391, 'p', 1430, 1446, 370, 391, 't', 1448, 1464, 370, 391, 'e', 1467, 1484, 370, 393, 'c', 1494, 1512, 370, 393, 'l', 1515, 1532, 370, 393, 'i', 1533, 1539, 370, 393, 'e', 1542, 1559, 370, 393, 'n', 1560, 1580, 370, 393, 't', 1580, 1598, 370, 393]] here second csv file:
char left right top bottom 0 'm' 38 104 2456 2492 1 'i' 40 102 2442 2448 2 'i' 40 100 2402 2410 3 'l' 40 102 2372 2382 4 'm' 40 102 2312 2358 5 'u' 40 102 2292 2310 6 'i' 40 104 2210 2260 7 'l' 40 104 2180 2208 8 'i' 40 104 2140 2166 edit1
here output solution 2 (`input character_position described` ) 1831 1830 level_2 char left top right bottom fromline all_chars_in_same_row 0 0 character_position 0 character_position 0 character_position 1 1 'm','i','i','l','m','u','i','l','i','l' 0 'm' 38 104 2456 2492 1 'm','i','i','l','m','u','i','l','i','l' 2 1 'm','i','i','l','m','u','i','l','i','l' 1 'i' 40 102 2442 2448 1 'm','i','i','l','m','u','i','l','i','l' 3 1 'm','i','i','l','m','u','i','l','i','l' 2 'i' 40 100 2402 2410 1 'm','i','i','l','m','u','i','l','i','l' l think probelm comes fact l have in data : [[',' , 'a', ',' , '.', ':' , ';', '1'], [], ['m', 'a',]] :
The empty `[ ]` causes a problem with the order. I noticed this when I tried to omit the empty [], because I find the following in the CSV: in the char column, ['a' rather than 'a', and in the values, 8794] rather than 8794, or [5345 rather than 5345. I processed the CSV as follows:
df = pd.read_csv(filepath_or_buffer='lit_charaters.csv', header=none, usecols=[1,3], names=['character_position','lineindex']) df = df.replace(['\[','\]'], ['',''], regex=true) cols = ['char','left','right','top','bottom','lineindex'] df1 = df.positionlrtb.str.strip('[]').str.split(',', expand=true) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) df1.columns = cols df1.char = df1.char.replace(['\[','\]'], ['',''], regex=true) df1['left']=df1['left'].replace(['\[','\]'], ['',''], regex=true) df1['top']=df1['top'].replace(['\[','\]'], ['',''], regex=true) df1['right']=df1['right'].replace(['\[','\]'], ['',''], regex=true) df1['bottom']=df1['bottom'].replace(['\[','\]'], ['',''], regex=true) df1.to_csv('char.csv') then l noticed following
Look at line 1221, column b: it is empty where the [] was replaced, and the columns (b, c) are switched out of order because of the empty char. How can I solve this? I also have an empty line:
3831 '6' 296 314 3204 3231 3832 3833 '1' 480 492 3202 3229 Line 3832 should be removed.
**Edit 2:** In order to solve the problem of empty rows and [] in list_characters.csv, such as:
[['1', 2364, 2382, 1552, 1585], [], ['e', 2369, 2381, 1623, 1640], ['8', 2369, 2382, 1644, 1668]] , [[]] [[]]
I did the following:
df1 = df.applymap(lambda x: [y y in x if len(y) > 0]) df1 = df1[df1.applymap(len).ne(0).all(axis=1)] df1 = df.replace(['\[\],','\[\[\]\]', ''],['','', np.nan], regex=true) df1 = df1.dropna() df = pd.read_csv('character_position.csv', index_col=0) df.positionlrtb = df.positionlrtb.apply(ast.literal_eval) df.positionlrtb = df.positionlrtb.apply(lambda x: [y y in x if len(y) > 0]) print (df.head()) page_number positionlrtb \ 0 1841729699_001 [[m, 38, 104, 2456, 2492, i, 40, 102, 2442, 24... 1 1841729699_001 [[., 203, 213, 191, 198, 3, 235, 262, 131, 198]] 2 1841729699_001 [[a, 275, 347, 147, 239, m, 363, 465, 145, 239... 3 1841729699_001 [[a, 73, 91, 373, 394, d, 93, 112, 373, 396, r... 4 1841729699_001 [[d, 454, 473, 663, 685, o, 474, 495, 664, 687... lineindex 0 [[mi, il, mu, il, il]] 1 [[.3]] 2 [[amsun]] 3 [[adresse, de, livraison]] 4 [[document]] cols = ['char','left','top','right','bottom'] df1 = pd.dataframe({ "a": np.repeat(df.page_number.values, df.positionlrtb.str.len()), "b": list(chain.from_iterable(df.positionlrtb))}) df1 = pd.dataframe(df1.b.values.tolist()) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) cols = ['char','left','top','right','bottom'] df1.columns = cols df1[cols[1:]] = df1[cols[1:]].astype(int) print (df1) char left top right bottom 0 m 38 104 2456 2492 1 40 102 2442 2448 2 40 100 2402 2410 3 l 40 102 2372 2382 4 m 40 102 2312 2358 5 u 40 102 2292 2310 6 40 104 2210 2260 7 l 40 104 2180 2208 8 40 104 2140 2166 however :
df_data = df_data.character_position.str.strip('[]').str.split(', ', expand=True) returns None values.
Once you create the required data frame, do not remove the index after stacking, because it holds the line number. Since this is a multi-level index, the first level is the line number.
df_data['lineindex'] = df_data.index.get_level_values(0) Then you can group by the lineindex column and collect the characters that share a common lineindex. I created a dictionary for this. Convert the dictionary to a data frame and merge it with the actual data.
solution 1
import pandas as pd

df_data = pd.read_csv('list_characters.csv', header=None, usecols=[1], names=['character_position'])
df_data = df_data.character_position.str.strip('[]').str.split(', ', expand=True)
df_data.columns = [df_data.columns % 5, df_data.columns // 5]
df_data = df_data.stack()  # don't remove the index; it has the line from which the record was created
print df_data
df_data['fromline'] = df_data.index.get_level_values(0)  # assign the line number to a column
cols = ['char','left','top','right','bottom','fromline']
df_data.columns = cols  # assign the new column names
# create a new dictionary
# it contains the line number as key and the characters in that line as value
dictchar = {k: list(v) for k, v in df_data.groupby("fromline")["char"]}
# convert the dictionary to a dataframe
df_chars = pd.DataFrame(dictchar.items())
df_chars.columns = cols = ['fromline','char']
# merge the dataframes on column 'fromline'
df_final = df_data.merge(df_chars, on='fromline')
cols = ['char','left','top','right','bottom','fromline','all_chars_in_same_row']
df_final.columns = cols
print df_final

Solution 2
I prefer this solution over the first one. See the inline comments for more details.
import pandas as pd

df_data = pd.read_csv('list_characters.csv', header=None, usecols=[1], names=['character_position'])
df_data = df_data.character_position.str.strip('[]').str.split(', ', expand=True)
x = len(df_data.columns)  # get the total number of columns
# get the characters from every 5th column, concatenate them and create a new column in df_data
df_data[x] = df_data[df_data.columns[::5]].apply(lambda x: ','.join(x.dropna()), axis=1)
# index of each row; this is the line number of the record
df_data[x+1] = df_data.index.get_level_values(0)
# set the line number and character columns as the index of the data frame
df_data.set_index([x+1, x], inplace=True, drop=True)
df_data.columns = [df_data.columns % 5, df_data.columns // 5]
df_data = df_data.stack()
df_data['fromline'] = df_data.index.get_level_values(0)  # assign the line number to a column
df_data['all_chars_in_same_row'] = df_data.index.get_level_values(1)  # assign the character values to a column
cols = ['char','left','top','right','bottom','fromline','all_chars_in_same_row']
df_data.columns = cols
df_data.reset_index(inplace=True)  # remove the multi-indexing
print df_data[cols]

Output
char left top right bottom line all_chars_in_same_row 0 '.' 203 213 191 198 0 ['.', '3', 'c'] 1 '3' 1758 1775 370 391 0 ['.', '3', 'c'] 2 'c' 296 305 1492 1516 0 ['.', '3', 'c'] 3 'a' 275 347 147 239 1 ['a', 'm', 'd'] 4 'm' 2166 2184 370 391 1 ['a', 'm', 'd'] 5 'd' 339 362 1815 1840 1 ['a', 'm', 'd'] 6 'a' 73 91 373 394 2 ['a', 'd', 'a'] 7 'd' 1395 1415 427 454 2 ['a', 'd', 'a'] 8 'a' 1440 1455 2047 2073 2 ['a', 'd', 'a'] 9 'd' 454 473 663 685 3 ['d', 'o', '0'] 10 'o' 1533 1545 487 541 3 ['d', 'o', '0'] 11 '0' 339 360 2137 2163 3 ['d', 'o', '0'] 12 'a' 108 129 727 751 4 ['a', 'v', 'i'] 13 'v' 1659 1677 490 514 4 ['a', 'v', 'i'] 14 'i' 339 360 1860 1885 4 ['a', 'v', 'i'] 15 'n' 34 51 949 970 5 ['n', '/', '2'] 16 '/' 1890 1904 486 505 5 ['n', '/', '2'] 17 '2' 1266 1283 1951 1977 5 ['n', '/', '2'] 18 's' 1368 1401 43 85 6 ['s', 'a', '8'] 19 'a' 1344 1361 583 607 6 ['s', 'a', '8'] 20 '8' 2207 2217 1492 1515 6 ['s', 'a', '8'] 21 's' 1437 1457 112 138 7 ['s', 'o', 'o'] 22 'o' 1548 1580 979 1015 7 ['s', 'o', 'o'] 23 'o' 1331 1349 370 391 7 ['s', 'o', 'o'] 24 'h' 1686 1703 315 339 8 ['h', 't', 't'] 25 't' 169 190 1291 1312 8 ['h', 't', 't'] 26 't' 169 190 1291 1312 8 ['h', 't', 't'] 27 'n' 1331 1349 370 391 9 ['n', 'c', 'c'] 28 'c' 296 305 1492 1516 9 ['n', 'c', 'c'] 29 'c' 296 305 1492 1516 9 ['n', 'c', 'c'] 

Comments
Post a Comment