python - Get the row index of each extracted character from csv file -


I have a column (the second column, called second_column) in a CSV file which represents a list of characters and their positions, as follows: the column is called character_position

Each line of the column contains a list of character positions. Overall I have 300 lines in this column, each holding a list of character positions.

character_position = [['1', 1890, 1904, 486, 505, '8', 1905, 1916, 486, 507, '4', 1919, 1931, 486, 505, '1', 1935, 1947, 486, 505, '7', 1950, 1962, 486, 505, '2', 1965, 1976, 486, 505, '9', 1980, 1992, 486, 507, '6', 1995, 2007, 486, 505, '/', 2010, 2022, 484, 508, '4', 2025, 2037, 486, 505, '8', 2040, 2052, 486, 505, '3', 2057, 2067, 486, 507, '3', 2072, 2082, 486, 505, '0', 2085, 2097, 486, 507, '/', 2100, 2112, 484, 508, 'q', 2115, 2127, 486, 507, '1', 2132, 2144, 486, 505, '7', 2147, 2157, 486, 505, '9', 2162, 2174, 486, 505, '/', 2175, 2189, 484, 508, 'c', 2190, 2204, 487, 505, '4', 2207, 2219, 486, 505, '1', 2241, 2253, 486, 505, '/', 2255, 2268, 484, 508, '1', 2271, 2285, 486, 507, '5', 2288, 2297, 486, 505], ['d', 2118, 2132, 519, 535, '.', 2138, 2144, 529, 534, '2', 2150, 2162, 516, 535, '0', 2165, 2177, 516, 535, '4', 2180, 2192, 516, 534, '7', 2196, 2208, 516, 534, '0', 2210, 2223, 514, 535, '1', 2226, 2238, 516, 534, '8', 2241, 2253, 514, 534, '2', 2256, 2267, 514, 535, '4', 2270, 2282, 516, 534, '0', 2285, 2298, 514, 535]] 

Each character has four values: left, top, right, bottom. For instance, character '1' has left=1890, top=1904, right=486, bottom=505.

My whole CSV file is read as follows:

df = pd.read_csv(filepath_or_buffer='list_characters.csv', header=none, usecols=[1], names=['character_position]) 

From this file I created a new CSV file with 5 columns:

column 1:  character, column 2 : left , column 3 : top, column 4 : right, column 5 : bottom. cols = ['char','left','top','right','bottom'] df1 = df.character_position.str.strip('[]').str.split(', ', expand=true) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) df1.columns = cols df1[cols[1:]] = df1[cols[1:]].astype(int) print (df1)    char  left  top  right  bottom 0   'm'    38  104   2456    2492 1   'i'    40  102   2442     222 2   '.'   203  213    191     198 3   '3'   235  262    131    3333 4   'a'   275  347    147     239 5   'm'   363  465    145    3334 6   'a'    73   91    373     394 7   'd'    93  112    373      39 8   'd'   454  473    663     685 9   'o'   474  495    664      33 10  'a'   108  129    727     751 11  'v'   129  150    727     444 

I want to add 2 other columns called line_number and all_chars_in_same_row. 1) line_number corresponds to the line from which, for example, 'm' 38 104 2456 2492 was extracted, let's say line 2. 2) all_chars_in_same_row corresponds to all the (spaced) characters which are in the same row. For instance:

character_position = [['1', 1890, 1904, 486, 505, '8', 1905, 1916, 486, 507, '4', 1919, 1931, 486, 505, '1', 1935, 1947, 486, 505, '7', 1950, 1962, 486, 505, '2', 1965, 1976, 486, 505, '9', 1980, 1992, 486, 507, '6', 1995, 2007, 486, 505, '/', 2010, 2022, 484, 508, '4', 2025, 2037, 486, 505, '8', 2040, 2052, 486, 505, '3', 2057, 2067, 486, 507, '3', 2072, 2082, 486, 505, '0', 2085, 2097, 486, 507, '/', 2100, 2112, 484, 508, 'q', 2115, 2127, 486, 507, '1', 2132, 2144, 486, 505, '7', 2147, 2157, 486, 505, '9', 2162, 2174, 486, 505, '/', 2175, 2189, 484, 508, 'c', 2190, 2204, 487, 505, '4', 2207, 2219, 486, 505, '1', 2241, 2253, 486, 505, '/', 2255, 2268, 484, 508, '1', 2271, 2285, 486, 507, '5', 2288, 2297, 486, 505], ['d', 2118, 2132, 519, 535, '.', 2138, 2144, 529, 534, '2', 2150, 2162, 516, 535, '0', 2165, 2177, 516, 535, '4', 2180, 2192, 516, 534, '7', 2196, 2208, 516, 534, '0', 2210, 2223, 514, 535, '1', 2226, 2238, 516, 534, '8', 2241, 2253, 514, 534, '2', 2256, 2267, 514, 535, '4', 2270, 2282, 516, 534, '0', 2285, 2298, 514, 535]] 

I get '1' '8' '4' '1' '7', and so on.

More formally, all_chars_in_same_row means: write all the characters of the given row indicated in the line_number column.

char  left  top  right  bottom     line_number  all_chars_in_same_row 0   'm'    38  104   2456    2492   line 2  'm' '2' '5' 'g' 1   'i'    40  102   2442     222   line 4 2   '.'   203  213    191     198   line 6 3   '3'   235  262    131    3333   4   'a'   275  347    147     239 5   'm'   363  465    145    3334 6   'a'    73   91    373     394 7   'd'    93  112    373      39 8   'd'   454  473    663     685 9   'o'   474  495    664      33 10  'a'   108  129    727     751 11  'v'   129  150    727     444 

edit1:

import pandas pd df_data=pd.read_csv('/home/ahmed/internship/cnn_ocr/list_characters.csv')  df_data.shape 

(50, 3)

df_data.icol(1)    0     [['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442... 1     [['.', 203, 213, 191, 198, '3', 235, 262, 131,... 2     [['a', 275, 347, 147, 239, 'm', 363, 465, 145,... 3     [['a', 73, 91, 373, 394, 'd', 93, 112, 373, 39... 4     [['d', 454, 473, 663, 685, 'o', 474, 495, 664,... 5     [['a', 108, 129, 727, 751, 'v', 129, 150, 727,... 6     [['n', 34, 51, 949, 970, '/', 52, 61, 948, 970... 7     [['s', 1368, 1401, 43, 85, 'a', 1406, 1446, 43... 8     [['s', 1437, 1457, 112, 138, 'o', 1458, 1476, ... 9     [['h', 1686, 1703, 315, 339, 't', 1706, 1715, ... 10    [['n', 1331, 1349, 370, 391, 'c', 1361, 1379, ... 11    [['n', 1758, 1775, 370, 391, 'd', 1785, 1803, ... 12    [['d', 2166, 2184, 370, 391, 'a', 2186, 2205, ... 13    [['2', 1395, 1415, 427, 454, '0', 1416, 1434, ... 14    [['i', 1533, 1545, 487, 541, 'i', 1548, 1551, ... 15    [['p', 1659, 1677, 490, 514, '2', 1680, 1697, ... 16    [['1', 1890, 1904, 486, 505, '8', 1905, 1916, ... 17    [['b', 1344, 1361, 583, 607, 'o', 1364, 1386, ... 18    [['b', 1548, 1580, 979, 1015, 't', 1586, 1619,... 19    [['q', 169, 190, 1291, 1312, 'u', 192, 210, 12... 20    [['1', 296, 305, 1492, 1516, 's', 339, 357, 14... 21    [['g', 339, 362, 1815, 1840, 's', 365, 384, 18... 22    [['2', 1440, 1455, 2047, 2073, '9', 1458, 1475... 23    [['r', 339, 360, 2137, 2163, 'e', 363, 378, 21... 24    [['r', 339, 360, 1860, 1885, 'e', 363, 380, 18... 25    [['0', 1266, 1283, 1951, 1977, ',', 1287, 1290... 26    [['1', 2207, 2217, 1492, 1515, '0', 2225, 2240... 27    [['1', 2364, 2382, 1552, 1585], [], ['e', 2369... 28                      [['s', 2369, 2382, 1833, 1866]] 29    [['0', 2243, 2259, 1951, 1977, '0', 2271, 2288... 30    [['0', 2243, 2259, 2227, 2253, '0', 2271, 2286... 31    [['d', 76, 88, 2580, 2596, 'é', 91, 100, 2580,... 32    [['ü', 1474, 1489, 2586, 2616, '3', 1541, 1557... 33    [['e', 1440, 1461, 2670, 2697, 'u', 1466, 1488... 34    [['2', 1685, 1703, 2670, 2697, '.', 1707, 1712... 
35    [['1', 2202, 2213, 2668, 2695, '3', 2220, 2237... 36                         [['c', 88, 118, 2872, 2902]] 37    [['n', 127, 144, 2889, 2910, 'd', 156, 175, 28... 38    [['e', 108, 129, 3144, 3172, 'c', 133, 156, 31... 39    [['5', 108, 126, 3204, 3231, '0', 129, 147, 32... 40                                                 [[]] 41    [['1', 480, 492, 3202, 3229, '6', 500, 518, 32... 42    [['p', 217, 234, 3337, 3360, 'a', 235, 255, 33... 43                                                 [[]] 44    [['i', 954, 963, 2892, 2934, 'm', 969, 1011, 2... 45    [['e', 1385, 1407, 2970, 2998, 'u', 1410, 1433... 46    [['t', 2067, 2084, 2889, 2911, 'o', 2088, 2106... 47    [['1', 2201, 2213, 2970, 2997, '6', 2219, 2238... 48    [['m', 1734, 1755, 3246, 3267, 'o', 1758, 1779... 49    [['l', 923, 935, 3411, 3430, 'a', 941, 957, 34... name: character_position, dtype: object 

Then, for char.csv, I do the following:

    df = pd.read_csv('list_characters.csv', header=none, usecols=[1], names=['character_position'])     df = df.replace(['\[','\]'], ['',''], regex=true)     cols = ['char','left','right','top','bottom'] df1 = df.positionlrtb.str.strip('[]').str.split(',', expand=true) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) df1.columns = cols df1.char = df1.char.replace(['\[','\]'], ['',''], regex=true) df1['left']=df1['left'].replace(['\[','\]'], ['',''], regex=true) df1['top']=df1['top'].replace(['\[','\]'], ['',''], regex=true) df1['right']=df1['right'].replace(['\[','\]'], ['',''], regex=true) df1['bottom']=df1['bottom'].replace(['\[','\]'], ['',''], regex=true) df1.to_csv('chars.csv') 

However, I don't see in the response how to add the columns from_line and all_char_in_same_rows.

When I execute this line of code:

df_data = df_data.character_position.str.strip('[]').str.split(',', expand=true) 

I get the following:

df_data[0:10]   0      1      2      3      4     5      6      7      8      9     ...   \ 0  'm'     38    104   2456   2492   'i'     40    102   2442   2448  ...     1  '.'    203    213    191    198   '3'    235    262    131    198  ...     2  'a'    275    347    147    239   'm'    363    465    145    239  ...     3  'a'     73     91    373    394   'd'     93    112    373    396  ...     4  'd'    454    473    663    685   'o'    474    495    664    687  ...     5  'a'    108    129    727    751   'v'    129    150    727    753  ...     6  'n'     34     51    949    970   '/'     52     61    948    970  ...     7  's'   1368   1401     43     85   'a'   1406   1446     43     85  ...     8  's'   1437   1457    112    138   'o'   1458   1476    118    138  ...     9  'h'   1686   1703    315    339   't'   1706   1715    316    339  ...        1821  1822  1823  1824  1825  1826  1827  1828  1829  1830   0  none  none  none  none  none  none  none  none  none  none   1  none  none  none  none  none  none  none  none  none  none   2  none  none  none  none  none  none  none  none  none  none   3  none  none  none  none  none  none  none  none  none  none   4  none  none  none  none  none  none  none  none  none  none   5  none  none  none  none  none  none  none  none  none  none   6  none  none  none  none  none  none  none  none  none  none   

Here are the first 10 lines of the CSV file:

    character_position 0   [['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442, 2448, 'i', 40, 100, 2402, 2410, 'l', 40, 102, 2372, 2382, 'm', 40, 102, 2312, 2358, 'u', 40, 102, 2292, 2310, 'i', 40, 104, 2210, 2260, 'l', 40, 104, 2180, 2208, 'i', 40, 104, 2140, 2166, 'l', 40, 104, 2124, 2134]] 1   [['.', 203, 213, 191, 198, '3', 235, 262, 131, 198]] 2   [['a', 275, 347, 147, 239, 'm', 363, 465, 145, 239, 's', 485, 549, 145, 243, 'u', 569, 631, 145, 241, 'n', 657, 733, 145, 239]] 3   [['a', 73, 91, 373, 394, 'd', 93, 112, 373, 396, 'r', 115, 133, 373, 396, 'e', 136, 153, 373, 396, 's', 156, 172, 373, 396, 's', 175, 192, 373, 396, 'e', 195, 211, 373, 396, 'd', 222, 241, 373, 396, 'e', 244, 261, 373, 396, 'l', 272, 285, 375, 396, 'i', 288, 293, 375, 396, 'v', 296, 314, 375, 396, 'r', 317, 334, 373, 396, 'a', 334, 354, 375, 396, 'i', 357, 360, 373, 396, 's', 365, 381, 373, 396, 'o', 384, 405, 373, 396, 'n', 408, 425, 373, 394]] 4   [['d', 454, 473, 663, 685, 'o', 474, 495, 664, 687, 'c', 498, 516, 664, 687, 'u', 519, 536, 663, 687, 'm', 540, 561, 663, 687, 'e', 564, 581, 663, 685, 'n', 584, 600, 664, 685, 't', 603, 618, 663, 685]] 5   [['a', 108, 129, 727, 751, 'v', 129, 150, 727, 753, 'o', 153, 175, 727, 753, 'i', 178, 183, 727, 751, 'r', 187, 210, 727, 751, 's', 220, 240, 727, 753, 'u', 243, 263, 727, 753, 'r', 267, 288, 727, 751, 'f', 302, 318, 727, 751, 'a', 320, 341, 727, 751, 'c', 342, 363, 726, 751, 't', 366, 384, 726, 750, 'u', 387, 407, 727, 751, 'r', 411, 432, 727, 751, 'e', 435, 453, 726, 751, 'p', 797, 815, 727, 751, 'a', 818, 839, 727, 751, 'g', 840, 863, 727, 751, 'e', 867, 885, 726, 751, '1', 900, 911, 727, 751, '1', 926, 934, 727, 751, '1', 947, 956, 727, 751, '5', 962, 979, 727, 751], ['r', 120, 142, 778, 807, 't', 144, 165, 778, 805, 't', 178, 199, 778, 805, 'e', 201, 219, 786, 807, 'c', 222, 240, 786, 807, 'h', 241, 258, 778, 807, 'n', 263, 279, 786, 807, 'i', 284, 287, 778, 805, 'c', 291, 308, 786, 807, 'a', 309, 327, 786, 807, 'r', 350, 374, 778, 807, 
'e', 377, 395, 786, 807, 't', 396, 405, 780, 805, 'u', 408, 425, 786, 807, 'r', 429, 440, 786, 807, 'n', 441, 458, 786, 807, '-', 471, 482, 793, 798, 'd', 497, 518, 778, 807, 'o', 522, 548, 777, 807, 'a', 549, 573, 778, 807, '/', 585, 596, 778, 807, 'd', 606, 630, 778, 807, 'a', 632, 656, 778, 807, 'p', 659, 680, 778, 805]] 6   [['n', 34, 51, 949, 970, '/', 52, 61, 948, 970, 'c', 63, 81, 948, 970, 'o', 84, 103, 948, 970, 'm', 106, 127, 949, 970, 'm', 130, 151, 948, 970, 'a', 153, 172, 949, 970, 'n', 175, 192, 949, 970, 'd', 195, 213, 948, 970, 'e', 217, 232, 948, 970], ['1', 73, 84, 993, 1020, '1', 94, 105, 993, 1020, '8', 112, 130, 991, 1020, '4', 135, 153, 993, 1018, '5', 156, 172, 994, 1018, '7', 175, 192, 993, 1018, '6', 195, 213, 993, 1020, '0', 216, 235, 991, 1020, '6', 238, 257, 993, 1020, '5', 260, 278, 993, 1020, '0', 407, 425, 991, 1020, '9', 428, 446, 991, 1020, '.', 450, 455, 1015, 1020, '0', 459, 477, 991, 1020, '1', 485, 494, 994, 1018, '.', 503, 507, 1015, 1020, '2', 512, 530, 991, 1020, '0', 533, 551, 991, 1020, '1', 555, 566, 993, 1020, '5', 575, 593, 993, 1020, 'r', 632, 656, 991, 1020, 'm', 659, 684, 991, 1020, 'a', 689, 713, 991, 1020, 'n', 726, 747, 993, 1020, 'o', 752, 770, 999, 1020, '.', 774, 779, 1015, 1020, '5', 794, 812, 993, 1020, '8', 815, 833, 991, 1020, '4', 834, 852, 993, 1017, '4', 857, 873, 994, 1018, '3', 878, 896, 991, 1020, '8', 899, 917, 991, 1020, '0', 920, 938, 991, 1020, '/', 950, 960, 991, 1020, '0', 971, 990, 993, 1020, '7', 995, 1011, 993, 1018, '1', 1016, 1026, 993, 1018, '6', 1034, 1052, 993, 1020, '7', 1055, 1073, 993, 1020, '4', 1076, 1094, 993, 1018, '8', 1098, 1116, 991, 1020, '9', 1119, 1137, 991, 1020, '0', 1140, 1158, 993, 1020, '9', 1160, 1178, 991, 1020], ['n', 34, 51, 1045, 1066, '/', 54, 61, 1045, 1066, 'b', 63, 79, 1044, 1066, 'o', 82, 102, 1044, 1066, 'n', 105, 121, 1045, 1066, 'd', 133, 151, 1045, 1066, 'e', 156, 172, 1044, 1066, 'l', 183, 196, 1045, 1066, 'i', 199, 204, 1045, 1066, 'v', 205, 223, 1045, 
1066, 'r', 226, 244, 1045, 1066, 'a', 246, 266, 1045, 1066, 'i', 267, 272, 1045, 1066, 's', 275, 291, 1044, 1066, 'o', 294, 314, 1045, 1066, 'n', 318, 335, 1045, 1066], ['8', 72, 90, 1093, 1122, '2', 93, 109, 1093, 1122, '5', 114, 132, 1095, 1122, '9', 135, 153, 1093, 1122, '7', 154, 172, 1095, 1122, '1', 178, 189, 1093, 1122, '3', 196, 214, 1093, 1122, '1', 220, 231, 1095, 1122, '0', 238, 257, 1093, 1122, '3', 260, 278, 1093, 1122, '0', 407, 425, 1093, 1122, '6', 429, 447, 1095, 1122, '.', 452, 455, 1117, 1122, '0', 459, 477, 1093, 1122, '2', 480, 498, 1093, 1122, '.', 503, 507, 1117, 1122, '2', 512, 530, 1093, 1122, '0', 533, 551, 1093, 1122, '1', 557, 567, 1095, 1122, '5', 575, 593, 1095, 1122], ['v', 70, 90, 1150, 1171, '/', 88, 97, 1150, 1171, 'r', 100, 118, 1150, 1171, 'é', 121, 136, 1144, 1173, 'f', 141, 156, 1150, 1171, 'ê', 159, 174, 1144, 1173, 'r', 177, 195, 1150, 1173, 'e', 198, 214, 1150, 1171, 'n', 217, 234, 1150, 1171, 'c', 238, 257, 1149, 1171, 'e', 260, 276, 1149, 1173, 'b', 476, 497, 1152, 1179, 'o', 501, 527, 1149, 1179, 'g', 530, 555, 1150, 1180, 'd', 560, 582, 1152, 1179, 'o', 585, 611, 1149, 1179, 'a', 614, 638, 1150, 1179, '1', 642, 653, 1152, 1179, '5', 659, 677, 1153, 1180, 'b', 681, 701, 1152, 1179, 't', 705, 726, 1152, 1179, '0', 728, 746, 1152, 1179, '6', 749, 767, 1152, 1179]] 7   [['s', 1368, 1401, 43, 85, 'a', 1406, 1446, 43, 85, 'm', 1451, 1491, 36, 85, 's', 1500, 1533, 43, 85, 'u', 1539, 1574, 43, 85, 'n', 1581, 1616, 43, 85, 'g', 1623, 1662, 42, 85, 'e', 1686, 1719, 43, 85, 'l', 1725, 1755, 43, 85, 'e', 1763, 1794, 42, 85, 'c', 1800, 1836, 43, 85, 't', 1841, 1874, 42, 85, 'r', 1880, 1914, 42, 84, 'o', 1919, 1959, 42, 85, 'n', 1965, 1998, 42, 84, 'i', 2007, 2016, 42, 84, 'c', 2022, 2058, 42, 84, 's', 2066, 2099, 42, 84, 'f', 2121, 2151, 42, 84, 'r', 2159, 2193, 42, 84, 'a', 2198, 2237, 40, 84, 'n', 2243, 2277, 40, 84, 'c', 2285, 2321, 42, 84, 'e', 2328, 2360, 40, 84]] 8   [['s', 1437, 1457, 112, 138, 'o', 1458, 1476, 118, 138, 'c', 
1479, 1493, 120, 138, 'i', 1494, 1499, 112, 136, 'é', 1503, 1518, 114, 138, 't', 1520, 1527, 115, 138, 'é', 1530, 1547, 112, 138, 'p', 1559, 1575, 120, 144, 'a', 1577, 1593, 118, 138, 'r', 1596, 1607, 118, 136, 'a', 1616, 1637, 112, 136, 'c', 1640, 1653, 118, 138, 't', 1655, 1664, 115, 136, 'i', 1665, 1670, 112, 136, 'o', 1673, 1688, 118, 138, 'n', 1692, 1707, 118, 136, 's', 1710, 1725, 118, 138, 's', 1736, 1755, 112, 138, 'i', 1760, 1763, 112, 136, 'm', 1767, 1791, 118, 136, 'p', 1794, 1811, 118, 142, 'l', 1812, 1817, 112, 136, 'i', 1821, 1824, 112, 136, 'f', 1827, 1835, 112, 136, 'i', 1835, 1841, 112, 136, 'é', 1845, 1860, 112, 136, 'e', 1863, 1878, 118, 136, 'a', 1890, 1907, 118, 138, 'u', 1910, 1925, 118, 136, 'c', 1937, 1958, 112, 136, 'a', 1961, 1977, 118, 136, 'p', 1980, 1995, 118, 142, 'i', 1998, 2003, 112, 136, 't', 2006, 2013, 114, 136, 'a', 2015, 2030, 118, 136, 'l', 2034, 2037, 112, 136, 'd', 2051, 2066, 111, 136, 'e', 2069, 2085, 117, 136, '2', 2097, 2112, 112, 136, '7', 2115, 2132, 111, 136, '.', 2136, 2139, 132, 136, '0', 2144, 2159, 111, 136, '0', 2162, 2178, 111, 136, '0', 2180, 2196, 111, 136, '.', 2201, 2205, 132, 135, '0', 2208, 2225, 111, 136, '0', 2228, 2243, 111, 136, '0', 2246, 2261, 111, 136, 't', 2273, 2281, 112, 135, 'i', 2281, 2291, 111, 136], ['1', 1473, 1482, 153, 177, ',', 1491, 1494, 172, 181, 'r', 1508, 1517, 159, 177, 'u', 1520, 1535, 160, 177, 'e', 1538, 1554, 159, 177, 'f', 1566, 1583, 153, 177, 'r', 1587, 1596, 159, 177, 'u', 1598, 1613, 159, 177, 'c', 1617, 1631, 159, 177, 't', 1634, 1641, 154, 177, 'i', 1643, 1646, 153, 177, 'd', 1650, 1665, 151, 177, 'o', 1668, 1685, 159, 177, 'r', 1688, 1697, 159, 177, 'c', 1709, 1730, 153, 177, 's', 1733, 1751, 153, 177, '2', 1764, 1779, 153, 177, '0', 1781, 1797, 153, 177, '0', 1800, 1817, 153, 177, '3', 1820, 1835, 151, 177, '9', 1847, 1863, 151, 177, '3', 1866, 1883, 151, 177, '4', 1883, 1901, 153, 175, '8', 1904, 1919, 151, 177, '4', 1919, 1937, 153, 175, 's', 1950, 1968, 151, 177, 'a', 
1971, 1992, 151, 175, 'i', 1995, 2000, 151, 175, 'n', 2004, 2024, 151, 175, 't', 2027, 2046, 151, 175, 'o', 2058, 2081, 151, 177, 'u', 2085, 2105, 151, 177, 'e', 2109, 2127, 151, 177, 'n', 2130, 2150, 151, 175, 'c', 2163, 2186, 151, 175, 'e', 2187, 2204, 157, 175, 'd', 2207, 2222, 150, 175, 'e', 2225, 2240, 157, 175, 'x', 2243, 2258, 157, 175], ['t', 1638, 1656, 192, 216, 'É', 1659, 1677, 186, 217, 'l', 1682, 1697, 193, 217, 'É', 1701, 1719, 187, 217, 'p', 1722, 1742, 192, 217, 'h', 1746, 1766, 193, 217, 'o', 1770, 1793, 192, 217, 'n', 1796, 1815, 192, 216, 'e', 1820, 1838, 192, 217, '0', 1869, 1886, 190, 216, '1', 1890, 1899, 192, 216, '4', 1914, 1931, 193, 216, '4', 1934, 1950, 193, 216, '0', 1961, 1977, 190, 216, '4', 1980, 1997, 193, 216, '7', 2009, 2024, 192, 216, '0', 2027, 2042, 192, 216, '0', 2055, 2070, 192, 216, '0', 2073, 2090, 192, 216], ['r', 1517, 1538, 232, 258, '.', 1542, 1545, 253, 256, 'c', 1550, 1571, 232, 256, '.', 1575, 1580, 252, 256, 's', 1584, 1602, 232, 256, '.', 1607, 1611, 252, 256, 'b', 1625, 1643, 232, 256, 'o', 1649, 1670, 231, 258, 'b', 1674, 1692, 232, 256, 'i', 1697, 1701, 232, 256, 'g', 1706, 1728, 232, 256, 'n', 1731, 1751, 232, 256, 'y', 1754, 1775, 232, 256, 'b', 1788, 1806, 232, 256, '3', 1818, 1835, 231, 256, '3', 1838, 1855, 231, 256, '4', 1855, 1872, 232, 255, '3', 1884, 1899, 232, 256, '6', 1904, 1919, 232, 256, '7', 1922, 1937, 232, 256, '4', 1947, 1964, 232, 256, '9', 1967, 1983, 232, 256, '7', 1986, 2001, 232, 256, '-', 2013, 2022, 244, 249, 'a', 2034, 2055, 231, 255, 'p', 2057, 2075, 231, 255, 'e', 2079, 2097, 231, 256, '4', 2109, 2126, 232, 255, '6', 2129, 2145, 232, 256, '5', 2148, 2163, 232, 256, '2', 2166, 2183, 232, 255, 'z', 2193, 2211, 231, 255], ['c', 1628, 1647, 271, 297, 'o', 1652, 1670, 279, 297, 'd', 1671, 1689, 273, 297, 'e', 1692, 1709, 279, 298, 't', 1721, 1739, 273, 297, 'v', 1742, 1763, 273, 297, 'a', 1763, 1787, 273, 297, 'f', 1818, 1835, 273, 297, 'r', 1839, 1859, 273, 297, '8', 1872, 1889, 273, 297, 
'9', 1890, 1905, 273, 297, '3', 1919, 1932, 273, 297, '3', 1937, 1952, 273, 297, '4', 1953, 1971, 273, 297, '3', 1983, 1998, 273, 297, '6', 2001, 2018, 273, 297, '7', 2021, 2036, 273, 295, '4', 2048, 2064, 274, 297, '9', 2066, 2082, 273, 297, '7', 2085, 2100, 273, 295]] 9   [['h', 1686, 1703, 315, 339, 't', 1706, 1715, 316, 339, 't', 1718, 1727, 316, 339, 'p', 1730, 1748, 321, 345, 'i', 1751, 1757, 321, 339, 'f', 1760, 1769, 315, 339, '/', 1769, 1776, 313, 339, 'w', 1779, 1804, 321, 337, 'w', 1804, 1829, 321, 339, 'w', 1830, 1854, 321, 337, '.', 1859, 1863, 333, 337, 's', 1868, 1883, 319, 339, 'a', 1886, 1901, 321, 337, 'm', 1905, 1929, 321, 337, 's', 1932, 1949, 321, 339, 'u', 1953, 1968, 321, 339, 'n', 1973, 1989, 321, 339, 'g', 1992, 2010, 319, 345, '.', 2015, 2019, 333, 337, 'f', 2021, 2033, 313, 337, 'r', 2034, 2045, 319, 337]] 10  [['n', 1331, 1349, 370, 391, 'c', 1361, 1379, 370, 393, 'o', 1382, 1403, 370, 393, 'm', 1404, 1425, 370, 391, 'p', 1430, 1446, 370, 391, 't', 1448, 1464, 370, 391, 'e', 1467, 1484, 370, 393, 'c', 1494, 1512, 370, 393, 'l', 1515, 1532, 370, 393, 'i', 1533, 1539, 370, 393, 'e', 1542, 1559, 370, 393, 'n', 1560, 1580, 370, 393, 't', 1580, 1598, 370, 393]] 

Here is the second CSV file:

    char    left    right   top bottom 0   'm' 38  104 2456    2492 1   'i' 40  102 2442    2448 2   'i' 40  100 2402    2410 3   'l' 40  102 2372    2382 4   'm' 40  102 2312    2358 5   'u' 40  102 2292    2310 6   'i' 40  104 2210    2260 7   'l' 40  104 2180    2208 8   'i' 40  104 2140    2166 

edit1

here output solution 2 (`input character_position described` )      1831    1830    level_2 char    left    top right   bottom  fromline    all_chars_in_same_row 0   0   character_position  0   character_position                  0   character_position 1   1   'm','i','i','l','m','u','i','l','i','l' 0   'm' 38  104 2456    2492    1   'm','i','i','l','m','u','i','l','i','l' 2   1   'm','i','i','l','m','u','i','l','i','l' 1   'i' 40  102 2442    2448    1   'm','i','i','l','m','u','i','l','i','l' 3   1   'm','i','i','l','m','u','i','l','i','l' 2   'i' 40  100 2402    2410    1   'm','i','i','l','m','u','i','l','i','l' 

I think the problem comes from the fact that I have this in the data: [[',' , 'a', ',' , '.', ':' , ';', '1'], [], ['m', 'a',]] :

The empty `[ ]` causes a problem with the order of the columns. I noticed this when I tried to omit the empty [], because I find the following in the CSV:

in char : ['a' rather 'a' values 8794] rather 8794 or [5345 rather 5345 processed csv follow

    df = pd.read_csv(filepath_or_buffer='lit_charaters.csv', header=none, usecols=[1,3], names=['character_position','lineindex'])     df = df.replace(['\[','\]'], ['',''], regex=true) cols = ['char','left','right','top','bottom','lineindex'] df1 = df.positionlrtb.str.strip('[]').str.split(',', expand=true) df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true) df1.columns = cols df1.char = df1.char.replace(['\[','\]'], ['',''], regex=true) df1['left']=df1['left'].replace(['\[','\]'], ['',''], regex=true) df1['top']=df1['top'].replace(['\[','\]'], ['',''], regex=true) df1['right']=df1['right'].replace(['\[','\]'], ['',''], regex=true) df1['bottom']=df1['bottom'].replace(['\[','\]'], ['',''], regex=true) df1.to_csv('char.csv') 

enter image description here

Then I noticed the following:

Look at line 1221, column B: it is empty where the [] was replaced, and the columns are out of order (B and C are switched) because of the empty char. How can I solve this? I also have an empty line:

3831    '6' 296 314 3204    3231 3832                     3833    '1' 480 492 3202    3229 

Line 3832 should be removed.

in order thisenter image description here

**edit2:** 

In order to solve the problem of empty rows and [] in list_characters.csv, such as:

[['1', 2364, 2382, 1552, 1585], [], ['e', 2369, 2381, 1623, 1640], ['8', 2369, 2382, 1644, 1668]] , [[]] [[]]

I did the following:

    df1 = df.applymap(lambda x: [y y in x if len(y) > 0])      df1 = df1[df1.applymap(len).ne(0).all(axis=1)]      df1 = df.replace(['\[\],','\[\[\]\]', ''],['','', np.nan], regex=true)      df1 = df1.dropna()  df = pd.read_csv('character_position.csv', index_col=0)  df.positionlrtb = df.positionlrtb.apply(ast.literal_eval)  df.positionlrtb = df.positionlrtb.apply(lambda x: [y y in x if len(y) > 0]) print (df.head())       page_number                                       positionlrtb  \ 0  1841729699_001  [[m, 38, 104, 2456, 2492, i, 40, 102, 2442, 24...    1  1841729699_001   [[., 203, 213, 191, 198, 3, 235, 262, 131, 198]]    2  1841729699_001  [[a, 275, 347, 147, 239, m, 363, 465, 145, 239...    3  1841729699_001  [[a, 73, 91, 373, 394, d, 93, 112, 373, 396, r...    4  1841729699_001  [[d, 454, 473, 663, 685, o, 474, 495, 664, 687...                         lineindex   0      [[mi, il, mu, il, il]]   1                      [[.3]]   2                   [[amsun]]   3  [[adresse, de, livraison]]   4                [[document]]  cols = ['char','left','top','right','bottom']  df1 = pd.dataframe({         "a": np.repeat(df.page_number.values, df.positionlrtb.str.len()),         "b": list(chain.from_iterable(df.positionlrtb))})  df1 = pd.dataframe(df1.b.values.tolist())     df1.columns = [df1.columns % 5, df1.columns // 5] df1 = df1.stack().reset_index(drop=true)   cols = ['char','left','top','right','bottom'] df1.columns = cols df1[cols[1:]] = df1[cols[1:]].astype(int)    print (df1)      char  left   top  right  bottom 0       m    38   104   2456    2492 1          40   102   2442    2448 2          40   100   2402    2410 3       l    40   102   2372    2382 4       m    40   102   2312    2358 5       u    40   102   2292    2310 6          40   104   2210    2260 7       l    40   104   2180    2208 8          40   104   2140    2166 

However:

df_data = df_data.character_position.str.strip('[]').str.split(', ', expand=true) 

returns None values.

Once you create the required data frame, do not remove the index after stacking, because it holds the line number. Since this is a multi-level index, the first level is the line number.

df_data['lineindex'] = df_data.index.get_level_values(0) 

Then you can group by the lineindex column and collect the characters that share a common lineindex. These are stored in a dictionary. Convert the dictionary to a data frame and merge it with the actual data.


solution 1


import pandas pd  df_data=pd.read_csv('list_characters.csv' , header=none, usecols=[1], names=['character_position']) df_data = df_data.character_position.str.strip('[]').str.split(', ', expand=true) df_data.columns = [df_data.columns % 5, df_data.columns // 5]  df_data = df_data.stack() # dont remove index, has line record created print  df_data  df_data['fromline'] = df_data.index.get_level_values(0) #assign line number column  cols = ['char','left','top','right','bottom','fromline'] df_data.columns = cols #assign new column names  #create new dictionary #it contains line number key , characters line value  dictchar= {k: list(v) k,v in df_data.groupby("fromline")["char"]}  #convert dictionary dataframe  df_chars=pd.dataframe(dictchar.items()) df_chars.columns=cols = ['fromline','char']  #   merge dataframes on column 'fromline' df_final=df_data.merge(df_chars,on ='fromline') cols = ['char','left','top','right','bottom','fromline','all_chars_in_same_row'] df_final.columns=cols print df_final 

solution 2


I prefer this solution over the first one. See the inline comments for more details.

import pandas pd  df_data=pd.read_csv('list_characters.csv', header=none, usecols=[1], names=['character_position']) df_data = df_data.character_position.str.strip('[]').str.split(', ', expand=true)  x=len(df_data.columns) #get total number of columns  #get characters every 5th column, concatenate , create new column in df_data df_data[x] = df_data[df_data.columns[::5]].apply(lambda x: ','.join(x.dropna()), axis=1) # index of each row. line number record df_data[x+1]=df_data.index.get_level_values(0)   # set line number , character columns index of data frame df_data.set_index([x+1,x],inplace=true,drop=true)  df_data.columns = [df_data.columns % 5, df_data.columns // 5]  df_data = df_data.stack() df_data['fromline'] = df_data.index.get_level_values(0) #assign line number column df_data['all_chars_in_same_row'] = df_data.index.get_level_values(1) #assign character values column cols = ['char','left','top','right','bottom','fromline','all_chars_in_same_row'] df_data.columns=cols df_data.reset_index(inplace=true) #remove mutiindexing print df_data[cols] 

output

      char  left   top right bottom  line all_chars_in_same_row 0     '.'   203   213   191    198          0  ['.', '3', 'c'] 1     '3'  1758  1775   370    391          0  ['.', '3', 'c'] 2     'c'   296   305  1492   1516          0  ['.', '3', 'c'] 3     'a'   275   347   147    239          1  ['a', 'm', 'd'] 4     'm'  2166  2184   370    391          1  ['a', 'm', 'd'] 5     'd'   339   362  1815   1840          1  ['a', 'm', 'd'] 6     'a'    73    91   373    394          2  ['a', 'd', 'a'] 7     'd'  1395  1415   427    454          2  ['a', 'd', 'a'] 8     'a'  1440  1455  2047   2073          2  ['a', 'd', 'a'] 9     'd'   454   473   663    685          3  ['d', 'o', '0'] 10    'o'  1533  1545   487    541          3  ['d', 'o', '0'] 11    '0'   339   360  2137   2163          3  ['d', 'o', '0'] 12    'a'   108   129   727    751          4  ['a', 'v', 'i'] 13    'v'  1659  1677   490    514          4  ['a', 'v', 'i'] 14    'i'   339   360  1860   1885          4  ['a', 'v', 'i'] 15    'n'    34    51   949    970          5  ['n', '/', '2'] 16    '/'  1890  1904   486    505          5  ['n', '/', '2'] 17    '2'  1266  1283  1951   1977          5  ['n', '/', '2'] 18    's'  1368  1401    43     85          6  ['s', 'a', '8'] 19    'a'  1344  1361   583    607          6  ['s', 'a', '8'] 20    '8'  2207  2217  1492   1515          6  ['s', 'a', '8'] 21    's'  1437  1457   112    138          7  ['s', 'o', 'o'] 22    'o'  1548  1580   979   1015          7  ['s', 'o', 'o'] 23    'o'  1331  1349   370    391          7  ['s', 'o', 'o'] 24    'h'  1686  1703   315    339          8  ['h', 't', 't'] 25    't'   169   190  1291   1312          8  ['h', 't', 't'] 26    't'   169   190  1291   1312          8  ['h', 't', 't'] 27    'n'  1331  1349   370    391          9  ['n', 'c', 'c'] 28    'c'   296   305  1492   1516          9  ['n', 'c', 'c'] 29    'c'   296   305  1492   1516          9  ['n', 'c', 'c'] 

Comments

Popular posts from this blog

inversion of control - Autofac named registration constructor injection -

verilog - Systemverilog dynamic casting issues -

ios - Change Storyboard View using Seague -