I'm working on MariaDB 10.6.20 and trying to extract valid 10-digit phone numbers (leading 1 is removed) from source columns with bad formatting. I am extracting phone numbers from columns col1, col2, col3, col4 and saving the result in col5.
Here is my DB with sample source and result data:
CREATE TABLE `info_table` (
`id` int(11) NOT NULL,
`col1` text DEFAULT NULL,
`col2` text DEFAULT NULL,
`col3` text DEFAULT NULL,
`col4` text DEFAULT NULL,
`col5` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
INSERT INTO `info_table` (`id`, `col1`, `col2`, `col3`, `col4`, `col5`) VALUES
(1, '301', '7725', '1(301)775', '1(301) 258-7757 EXT127\"', '3012587757'),
(2, ' \"1(301) 254-8721\"', NULL, 'Direct:5152051266', '8662471186', '3012548721,5152051266,8662471186'),
(3, '650.3585100Ext.128', '1497005', '813-832-3985', '(888) 822-7434', '6503585100,8138323985,8888227434'),
(4, '772-778-2131 x07/06', '248.557-1200 Ext', '407¿5901433', '<strong>Office:<-strong>', '7727782131,2485571200,4075901433'),
(5, '954%560%4373', '727=-303-0889', ' ', '$4$-415-3575', '9545604373,7273030889'),
(6, '100090', NULL, ' \"1(301) 254-8721', NULL, '3012548721'),
(7, '', NULL, 'error-404', '330-2225544a', '3302225544'),
(8, '+1,324, 439,9114', '22290044-33', ' ', '...', '3244399114,2229004433'),
(9, '2,563,310,074', '20,575,566', '(334)794-0328 E', '205-655-8877 x2', '2563310074,3347940328,2056558877'),
(10, '251-981-7470 x7487', '205-326-3000 x414-09', '[]', NULL, '2519817470,2053263000'),
(11, '(877) 890-0050|(256) 373-3450', '(256) 882-2223|(866) 218-0196|(256) 990-7653', NULL, '333 ext', '8778900050,2568822223'),
(12, '(256) 302-7611, (205) 661-0811, (256) 302-7611, (877) 371-9353, (877) 371-9353', '1(907) 252-2743\", \"1(907) 252-2743\", \"1(907) 252-2743', NULL, '555-444(3322)', '2563027611,9072522743,5554443322'),
(13, 'Cellphone:515250', '1(650) 291-0470\", \"1(650) 291-0470\", \"1(DRE) :01-7482', '813-832-3985', '813-839-3463 x#', '6502910470,8138323985,8138393463'),
(14, '989-686-3400 x#133', '248.557-1200 Ext', '305*335-0950', '561-859+5692', '9896863400,2485571200,3053350950,5618595692'),
(15, '573000000000', '239-472-5147`', '205.391-9903', '555-400-2030-6009008020-3338887744', '5730000000,2394725147,2053919903,5554002030');
and here is my SQL query:
UPDATE info_table AS s
JOIN (
SELECT t.id,
NULLIF(
GROUP_CONCAT(
DISTINCT SUBSTRING(
REGEXP_REPLACE(TRIM(REGEXP_REPLACE(phone_number, '[^0-9]+$', '')), '[^0-9]', ''),
CASE
WHEN LEFT(
REGEXP_REPLACE(
TRIM(REGEXP_REPLACE(phone_number, '[^0-9]+$', '')), '[^0-9]', ''
), 1
) = '1' THEN 2 ELSE 1
END,
10
) ORDER BY t.row_num, t.num_order SEPARATOR ','
), ''
) AS extracted_phones
FROM (
SELECT id,
SUBSTRING_INDEX(SUBSTRING_INDEX(phone_data, split_char, numbers.n), split_char, -1) AS phone_number,
row_num,
numbers.n AS num_order
FROM (
SELECT id, col1 AS phone_data, 1 AS row_num FROM info_table WHERE col1 IS NOT NULL
UNION ALL
SELECT id, col2, 2 FROM info_table WHERE col2 IS NOT NULL
UNION ALL
SELECT id, col3, 3 FROM info_table WHERE col3 IS NOT NULL
UNION ALL
SELECT id, col4, 4 FROM info_table WHERE col4 IS NOT NULL
) AS expanded_columns
JOIN (
SELECT '[^0-9]+' AS split_char
) AS split_chars
ON 1 = 1
JOIN (
SELECT 1 AS n UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5
UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
) AS numbers
ON CHAR_LENGTH(phone_data) - CHAR_LENGTH(REPLACE(phone_data, split_char, '')) >= numbers.n - 1
) AS t
WHERE REGEXP_REPLACE(TRIM(REGEXP_REPLACE(phone_number, '[^0-9]+$', '')), '[^0-9]', '') REGEXP '1?([0-9]{10})'
GROUP BY t.id
) AS extracted
ON s.id = extracted.id
SET s.col5 = extracted.extracted_phones;
The query correctly extracts valid phone numbers from column entries where there is only one single phone number, but when there are multiple phone numbers in one column such as row 11, 12 and 15 in the sample dataset, it extracts only the first phone number from those columns.
Example: (877) 890-0050|(256) 373-3450 is extracted as 8778900050, it should be 8778900050,2563733450
555-400-2030-6009008020-3338887744 is extracted as 5554002030, it should be 5554002030,6009008020,3338887744
Any ideas how to modify the code to get multiple phone numbers from the same column correctly? Thanks!
I'm working on MariaDB 10.6.20 and trying to extract valid 10-digit phone numbers (leading 1 is removed) from source columns with bad formatting. I am extracting phone numbers from columns col1, col2, col3, col4 and saving the result in col5.
Here is my DB with sample source and result data:
CREATE TABLE `info_table` (
`id` int(11) NOT NULL,
`col1` text DEFAULT NULL,
`col2` text DEFAULT NULL,
`col3` text DEFAULT NULL,
`col4` text DEFAULT NULL,
`col5` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
INSERT INTO `info_table` (`id`, `col1`, `col2`, `col3`, `col4`, `col5`) VALUES
(1, '301', '7725', '1(301)775', '1(301) 258-7757 EXT127\"', '3012587757'),
(2, ' \"1(301) 254-8721\"', NULL, 'Direct:5152051266', '8662471186', '3012548721,5152051266,8662471186'),
(3, '650.3585100Ext.128', '1497005', '813-832-3985', '(888) 822-7434', '6503585100,8138323985,8888227434'),
(4, '772-778-2131 x07/06', '248.557-1200 Ext', '407¿5901433', '<strong>Office:<-strong>', '7727782131,2485571200,4075901433'),
(5, '954%560%4373', '727=-303-0889', ' ', '$4$-415-3575', '9545604373,7273030889'),
(6, '100090', NULL, ' \"1(301) 254-8721', NULL, '3012548721'),
(7, '', NULL, 'error-404', '330-2225544a', '3302225544'),
(8, '+1,324, 439,9114', '22290044-33', ' ', '...', '3244399114,2229004433'),
(9, '2,563,310,074', '20,575,566', '(334)794-0328 E', '205-655-8877 x2', '2563310074,3347940328,2056558877'),
(10, '251-981-7470 x7487', '205-326-3000 x414-09', '[]', NULL, '2519817470,2053263000'),
(11, '(877) 890-0050|(256) 373-3450', '(256) 882-2223|(866) 218-0196|(256) 990-7653', NULL, '333 ext', '8778900050,2568822223'),
(12, '(256) 302-7611, (205) 661-0811, (256) 302-7611, (877) 371-9353, (877) 371-9353', '1(907) 252-2743\", \"1(907) 252-2743\", \"1(907) 252-2743', NULL, '555-444(3322)', '2563027611,9072522743,5554443322'),
(13, 'Cellphone:515250', '1(650) 291-0470\", \"1(650) 291-0470\", \"1(DRE) :01-7482', '813-832-3985', '813-839-3463 x#', '6502910470,8138323985,8138393463'),
(14, '989-686-3400 x#133', '248.557-1200 Ext', '305*335-0950', '561-859+5692', '9896863400,2485571200,3053350950,5618595692'),
(15, '573000000000', '239-472-5147`', '205.391-9903', '555-400-2030-6009008020-3338887744', '5730000000,2394725147,2053919903,5554002030');
and here is my SQL query:
UPDATE info_table AS s
JOIN (
SELECT t.id,
NULLIF(
GROUP_CONCAT(
DISTINCT SUBSTRING(
REGEXP_REPLACE(TRIM(REGEXP_REPLACE(phone_number, '[^0-9]+$', '')), '[^0-9]', ''),
CASE
WHEN LEFT(
REGEXP_REPLACE(
TRIM(REGEXP_REPLACE(phone_number, '[^0-9]+$', '')), '[^0-9]', ''
), 1
) = '1' THEN 2 ELSE 1
END,
10
) ORDER BY t.row_num, t.num_order SEPARATOR ','
), ''
) AS extracted_phones
FROM (
SELECT id,
SUBSTRING_INDEX(SUBSTRING_INDEX(phone_data, split_char, numbers.n), split_char, -1) AS phone_number,
row_num,
numbers.n AS num_order
FROM (
SELECT id, col1 AS phone_data, 1 AS row_num FROM info_table WHERE col1 IS NOT NULL
UNION ALL
SELECT id, col2, 2 FROM info_table WHERE col2 IS NOT NULL
UNION ALL
SELECT id, col3, 3 FROM info_table WHERE col3 IS NOT NULL
UNION ALL
SELECT id, col4, 4 FROM info_table WHERE col4 IS NOT NULL
) AS expanded_columns
JOIN (
SELECT '[^0-9]+' AS split_char
) AS split_chars
ON 1 = 1
JOIN (
SELECT 1 AS n UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5
UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
) AS numbers
ON CHAR_LENGTH(phone_data) - CHAR_LENGTH(REPLACE(phone_data, split_char, '')) >= numbers.n - 1
) AS t
WHERE REGEXP_REPLACE(TRIM(REGEXP_REPLACE(phone_number, '[^0-9]+$', '')), '[^0-9]', '') REGEXP '1?([0-9]{10})'
GROUP BY t.id
) AS extracted
ON s.id = extracted.id
SET s.col5 = extracted.extracted_phones;
The query correctly extracts valid phone numbers from column entries where there is only one single phone number, but when there are multiple phone numbers in one column such as row 11, 12 and 15 in the sample dataset, it extracts only the first phone number from those columns.
Example: (877) 890-0050|(256) 373-3450 is extracted as 8778900050, it should be 8778900050,2563733450
555-400-2030-6009008020-3338887744 is extracted as 5554002030, it should be 5554002030,6009008020,3338887744
Any ideas how to modify the code to get multiple phone numbers from the same column correctly? Thanks!
Share Improve this question edited Feb 15 at 17:56 yenren asked Feb 15 at 12:34 yenrenyenren 49810 silver badges21 bronze badges 10- Is there all phone numbers 10-digit? – ValNik Commented Feb 15 at 12:58
- @ValNik Yes, all numbers 10-digit. Leading 1 is removed. – yenren Commented Feb 15 at 13:00
- This is more of a cleanup job from what I can see, and it maybe should be done before you bring your data into MySQL. – Tim Biegeleisen Commented Feb 15 at 13:04
- @TimBiegeleisen I agree, though the DB is already filled and I have to work on it as is. – yenren Commented Feb 15 at 13:28
- 1 Please please please read this before you go too far down this application design path. Falsehoods Programmers Believe About Phone Numbers. – O. Jones Commented Feb 16 at 14:34
1 Answer
Reset to default 0I will consider only a part of the query - a select data sample.
Updating the table is pretty clear.
- Expand columns to rows ->
phone_data
.Column namerow_num
in your query changed tocol_num
. - clear the
phone_data
: we replace
'+1'->''
'1(' ->''
'(1' ->''
'[^0-9]' -> '' - delete all non-numeric symbols - For phone_data (dataN), containing long strings, divide to parts by 10 symbols.
substring(dataN,(part_num-1)*10+1,10) part
There we use values table, because can not use cross join lateral. For example max 8 parts in 1 column
nn(part_num) AS ( VALUES (1),(2),(3),(4),(5),(6),(7),(8) )
- Aggregate back parts with length=10 -
new_data
. Parts with length<10 aggregated to ``rest_data`. I will omit DISTINCT for this example.
See example
Query
with expanded_columns as(
SELECT id, col1 AS phone_data, 1 AS col_num FROM info_table WHERE col1 IS NOT NULL
UNION ALL
SELECT id, col2, 2 FROM info_table WHERE col2 IS NOT NULL
UNION ALL
SELECT id, col3, 3 FROM info_table WHERE col3 IS NOT NULL
UNION ALL
SELECT id, col4, 4 FROM info_table WHERE col4 IS NOT NULL
)
,nn(part_num) AS ( VALUES (1),(2),(3),(4),(5),(6),(7),(8) )
,data_parts as(
select *
,substring(dataN,(part_num-1)*10+1,10) part
from(
select id,col_num,phone_data
,regexp_replace(replace(replace(replace(phone_data
,'+1','')
,'1(','')
,'(1','')
, '[^0-9]', ''
) dataN
from expanded_columns
)a
left join nn on part_num<=(length(dataN)+9)/10
)
select id
,replace(
group_concat(case when length(part)=10 then part end order by col_num,part_num separator',')
,',,',',') new_data
,replace(
group_concat(case when length(part)<10 then part end order by col_num,part_num separator',')
,',,',',') rest_data
from data_parts
group by id
order by id
With source data as
id | col1 | col2 | col3 | col4 |
---|---|---|---|---|
1 | 301 | 7725 | 1(301)775 | 1(301) 258-7757 EXT127" |
2 | "1(301) 254-8721" | null | Direct:5152051266 | 8662471186 |
3 | 650.3585100Ext.128 | 1497005 | 813-832-3985 | (888) 822-7434 |
4 | 772-778-2131 x07/06 | 248.557-1200 Ext | 407¿5901433 | <strong>Office:<-strong> |
5 | 954%560%4373 | 727=-303-0889 | \ | $4$-415-3575 |
6 | 100090 | null | "1(301) 254-8721 | null |
7 | null | error-404 | 330-2225544a | |
8 | +1,324, 439,9114 | 22290044-33 | ... | |
9 | 2,563,310,074 | 20,575,566 | (334)794-0328 E | 205-655-8877 x2 |
10 | 251-981-7470 x7487 | 205-326-3000 x414-09 | [] | null |
11 | (877) 890-0050|(256) 373-3450 | (256) 882-2223|(866) 218-0196|(256) 990-7653 | null | 333 ext |
12 | (256) 302-7611, (205) 661-0811, (256) 302-7611, (877) 371-9353, (877) 371-9353 | 1(907) 252-2743", "1(907) 252-2743", "1(907) 252-2743 | null | 555-444(3322) |
13 | Cellphone:515250 | 1(650) 291-0470", "1(650) 291-0470", "1(DRE) :01-7482 | 813-832-3985 | 813-839-3463 x# |
14 | 989-686-3400 x#133 | 248.557-1200 Ext | 305*335-0950 | 561-859+5692 |
15 | 573000000000 | 239-472-5147` | 205.391-9903 | 555-400-2030-6009008020-3338887744 |
Output is
id | new_data | rest_data |
---|---|---|
1 | 3012587757 | 301,7725,301775,127 |
2 | 3012548721,5152051266,8662471186 | null |
3 | 6503585100,8138323985,8888227434 | 128,1497005 |
4 | 7727782131,2485571200,4075901433 | 0706 |
5 | 9545604373,7273030889 | 44153575 |
6 | 3012548721 | 100090 |
7 | 3302225544 | 404 |
8 | 3244399114,2229004433 | null |
9 | 2563310074,3347940328,2056558877 | 20575566,2 |
10 | 2519817470,2053263000 | 7487,41409 |
11 | 8778900050,2563733450,2568822223,8662180196,2569907653 | 333 |
12 | 2563027611,2056610811,2563027611,8773719353,8773719353 ,9072522743,9072522743,9072522743,5554443322 | null |
13 | 6502910470,6502910470,8138323985,8138393463 | 515250,017482 |
14 | 9896863400,2485571200,3053350950,5618595692 | 133 |
15 | 5730000000,2394725147,2053919903,5554002030,6009008020,3338887744 | 00 |
fiddle
Update1
As pointed in comments by @Luuk, UPDATE
UPDATE info_table AS s
JOIN (
with expanded_columns as(
SELECT id, col1 AS phone_data, 1 AS col_num FROM info_table WHERE col1 IS NOT NULL
UNION ALL
SELECT id, col2, 2 FROM info_table WHERE col2 IS NOT NULL
UNION ALL
SELECT id, col3, 3 FROM info_table WHERE col3 IS NOT NULL
UNION ALL
SELECT id, col4, 4 FROM info_table WHERE col4 IS NOT NULL
)
,nn(part_num) AS ( VALUES (1),(2),(3),(4),(5),(6),(7),(8) )
,data_parts as(
select *
,substring(dataN,(part_num-1)*10+1,10) part
from(
select id,col_num,phone_data
,regexp_replace(replace(replace(phone_data,'+1',''),'1(',''), '[^0-9]', '') dataN
from expanded_columns
)a
left join nn on part_num<=(length(dataN)+9)/10
)
select id
,replace(
group_concat(DISTINCT case when length(part)=10 then part end order by col_num,part_num separator',')
,',,',',') extracted_phones -- new_data
from data_parts
group by id
) AS extracted
ON s.id = extracted.id
SET s.col5 = extracted.extracted_phones;
DISTINCT - as you use, in GROUP_CONCAT(DISTINCT ...)
fiddle