在SQL表中查找重复值

Finding duplicate values in a SQL table

用一个字段很容易找到duplicates

1
2
3
4
-- List each email that appears on more than one row, with its occurrence count.
-- The grouped column itself is selected: a bare `name` is neither grouped nor
-- aggregated, which ANSI SQL (and MySQL under ONLY_FULL_GROUP_BY) rejects.
SELECT
    email,
    COUNT(email) AS email_count
FROM users
GROUP BY email
HAVING COUNT(email) > 1

所以,如果我们有这样一张表:

1
2
3
4
5
6
ID   NAME   EMAIL
1    John   asd@asd.com
2    Sam    asd@asd.com
3    Tom    asd@asd.com
4    Bob    bob@asd.com
5    Tom    asd@asd.com

这个查询会返回 John、Sam、Tom、Tom,因为他们的 email 都相同。

不过,我想要的是 email 和 name 都相同的重复项。

也就是说,我想得到 "Tom"、"Tom"。

我需要这样做的原因是:我犯了一个错误,允许插入 name 和 email 都重复的记录。现在我需要删除或修改这些重复项,所以必须先把它们找出来。


1
2
3
4
5
6
7
8
-- Group on both columns so that only rows sharing the same (name, email)
-- pair are reported, together with how often the pair occurs.
SELECT name, email, COUNT(*)
FROM users
GROUP BY name, email
HAVING COUNT(*) > 1

只需对两列进行分组。

注意:旧的 ANSI 标准要求 GROUP BY 中包含所有非聚合列,但随着"函数依赖"(functional dependency)概念的引入,这一要求已经改变:

In relational database theory, a functional dependency is a constraint between two sets of attributes in a relation from a database. In other words, functional dependency is a constraint that describes the relationship between attributes in a relation.

支持不一致:

  • 最近的PostgreSQL支持它。
  • SQL Server(截至 SQL Server 2017)仍然要求 GROUP BY 中包含所有非聚合列。
  • MySQL 的行为不可预测,需要设置 sql_mode=only_full_group_by:
    • GROUP BY lname ORDER BY 显示错误结果;
    • 在没有 ANY() 的情况下,哪个聚合函数开销最小(参见已接受答案中的评论)。
  • Oracle 不够主流(警告:这是玩笑,我并不了解 Oracle)。


试试这个:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
-- Demo fixture (T-SQL): a table variable holding two exact (name, email)
-- duplicate pairs (John, sam) plus rows that share only one of the columns.
DECLARE @YourTable TABLE (id INT, name VARCHAR(10), email VARCHAR(50))

-- Explicit column lists keep the inserts valid if columns are later added
-- or reordered.
INSERT INTO @YourTable (id, name, email) VALUES (1, 'John', 'John-email')
INSERT INTO @YourTable (id, name, email) VALUES (2, 'John', 'John-email')
INSERT INTO @YourTable (id, name, email) VALUES (3, 'fred', 'John-email')
INSERT INTO @YourTable (id, name, email) VALUES (4, 'fred', 'fred-email')
INSERT INTO @YourTable (id, name, email) VALUES (5, 'sam', 'sam-email')
INSERT INTO @YourTable (id, name, email) VALUES (6, 'sam', 'sam-email')

-- One row per (name, email) pair that occurs more than once.
SELECT name, email, COUNT(*) AS CountOf
FROM @YourTable
GROUP BY name, email
HAVING COUNT(*) > 1

输出:

1
2
3
4
5
6
name       email       CountOf
---------- ----------- -----------
John       John-email  2
sam        sam-email   2

(2 ROW(s) affected)

如果需要重复记录的 ID,请使用:

1
2
3
4
5
6
7
8
9
-- Return every row (with its id) that belongs to a (name, email) group
-- occurring more than once.
SELECT src.id, src.name, src.email
FROM @YourTable src
INNER JOIN (
    -- one row per duplicated (name, email) pair
    SELECT name, email, COUNT(*) AS CountOf
    FROM @YourTable
    GROUP BY name, email
    HAVING COUNT(*) > 1
) dup_groups
    ON src.name = dup_groups.name
   AND src.email = dup_groups.email

输出:

1
2
3
4
5
6
7
8
id          name       email
----------- ---------- ------------
1           John       John-email
2           John       John-email
5           sam        sam-email
6           sam        sam-email

(4 ROW(s) affected)

要删除重复项,请尝试:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
-- Remove all but the lowest-id row of each (name, email) group, then show
-- what is left.
-- ROW_NUMBER() alone identifies the surplus rows: a row numbered above 1
-- necessarily belongs to a group with more than one member, so no extra
-- GROUP BY ... HAVING COUNT(*) > 1 join is needed. Ordering inside the
-- partition only needs id; the partition columns are constant there.
-- NOTE: PARTITION BY treats NULLs as equal, so duplicates whose name or
-- email is NULL are removed as well.
;WITH ranked AS (
    SELECT
        id,
        ROW_NUMBER() OVER (PARTITION BY name, email ORDER BY id) AS RowRank
    FROM @YourTable
)
DELETE FROM ranked
WHERE RowRank > 1

SELECT id, name, email FROM @YourTable

输出:

1
2
3
4
5
6
7
8
id          name       email
----------- ---------- --------------
1           John       John-email
3           fred       John-email
4           fred       fred-email
5           sam        sam-email

(4 ROW(s) affected)

试试这个:

1
2
3
4
-- One output row per (name, email) pair that appears more than once.
SELECT
    name,
    email
FROM
    users
GROUP BY
    name,
    email
HAVING
    COUNT(*) > 1

如果要删除重复项,这里有一种比在三个子选择中查找偶数/奇数行更简单的方法:

1
2
3
-- List every row for which a lower-id row with the same name and email
-- exists, i.e. the surplus copies of each duplicate group.
-- Columns are qualified because both aliases expose id/name/email, so
-- unqualified references are ambiguous. DISTINCT keeps a row from being
-- listed once per earlier copy when a value occurs three or more times.
SELECT DISTINCT u.id, u.name, u.email
FROM users u
INNER JOIN users u2
    ON u.name = u2.name
   AND u.email = u2.email
WHERE u.id > u2.id

因此删除:

1
2
3
4
5
6
-- Delete every row for which a lower-id row with the same name and email
-- exists, leaving the lowest-id row of each group.
-- The subquery column is qualified (u.id) because both aliases expose id.
DELETE FROM users
WHERE id IN (
    SELECT u.id
    FROM users u
    INNER JOIN users u2
        ON u.name = u2.name
       AND u.email = u2.email
    WHERE u.id > u2.id
)

更容易阅读和理解imho

注意:唯一的问题是,您必须执行请求,直到没有删除任何行,因为每次只删除一个副本


请尝试以下操作:

1
2
3
4
5
6
-- Rows numbered above 1 within their (Name, Age) group are the surplus
-- copies of that group.
-- The numbering is ordered by Id so that the same row is left out of the
-- result on every run (assumes Id is unique -- TODO confirm). Ordering by
-- Name, which is constant inside the partition, left the choice arbitrary.
SELECT *
FROM (
    SELECT
        Id, Name, Age, Comments,
        ROW_NUMBER() OVER (PARTITION BY Name, Age ORDER BY Id) AS Rank
    FROM Customers
) AS B
WHERE Rank > 1


1
2
3
4
5
6
 SELECT name, email
    FROM users
    WHERE email IN
    (SELECT email FROM users
    GROUP BY email
    HAVING COUNT(*)>1)

参加派对晚了一点,但我找到了一个很酷的解决方法来查找所有重复的ID:

1
2
3
4
-- For each email used more than once, return the ids of its rows
-- concatenated into a single string.
SELECT GROUP_CONCAT(id)
FROM users
GROUP BY email
HAVING COUNT(email) > 1


试试这个代码

1
2
3
4
5
-- Number the rows of each (Name, Age) group in ccn order and return only
-- those numbered above 1, i.e. the repeated rows. Without the RN filter the
-- query returns every row of ccnmaster rather than the duplicates.
WITH CTE AS (
    SELECT
        Id, Name, Age, Comments,
        RN = ROW_NUMBER() OVER (PARTITION BY Name, Age ORDER BY ccn)
    FROM ccnmaster
)
SELECT *
FROM CTE
WHERE RN > 1

如果您使用Oracle,最好使用这种方式:

1
2
3
4
5
6
7
8
9
10
11
12
13
-- Oracle demo: rebuild the sample users table, then list every row that is
-- not the first (lowest-rowid) row of its (name, email) group.
CREATE TABLE my_users(id NUMBER, name varchar2(100), email varchar2(100));

-- Email values restored from the sample table shown earlier in this
-- document (they had been replaced by an obfuscation placeholder).
-- Explicit column lists keep the inserts valid if columns change.
INSERT INTO my_users (id, name, email) VALUES (1, 'John', 'asd@asd.com');
INSERT INTO my_users (id, name, email) VALUES (2, 'Sam', 'asd@asd.com');
INSERT INTO my_users (id, name, email) VALUES (3, 'Tom', 'asd@asd.com');
INSERT INTO my_users (id, name, email) VALUES (4, 'Bob', 'bob@asd.com');
INSERT INTO my_users (id, name, email) VALUES (5, 'Tom', 'asd@asd.com');

commit;

-- MIN(rowid) per group marks the row to keep; everything else is a duplicate.
SELECT *
  FROM my_users
 WHERE rowid NOT IN (SELECT MIN(rowid) FROM my_users GROUP BY name, email);


这将从每个重复组中选择/删除除一个记录之外的所有重复记录。因此,删除操作将保留所有唯一记录+每组重复项中的一条记录。

选择重复项:

1
2
3
4
5
6
7
8
-- Select every row except the lowest-id row of each (column1, column2)
-- group. your_table, column1 and column2 are placeholders to substitute;
-- the reserved word TABLE cannot itself be used as a table name.
SELECT *
FROM your_table
WHERE id NOT IN (
    SELECT MIN(id)
    FROM your_table
    GROUP BY column1, column2
);

删除重复项:

1
2
3
4
5
6
7
-- Delete every row except the lowest-id row of each (column1, column2)
-- group. your_table, column1 and column2 are placeholders to substitute;
-- the reserved word TABLE cannot itself be used as a table name.
DELETE FROM your_table
WHERE id NOT IN (
    SELECT MIN(id)
    FROM your_table
    GROUP BY column1, column2
);

注意大量的记录,可能会导致性能问题。


1
-- (id, name) pairs that occur more than once in India, with their counts.
SELECT id, name, COUNT(*)
FROM India
GROUP BY Id, Name
HAVING COUNT(*) > 1

我们如何统计重复值的数量?即出现 2 次或 2 次以上的值。只需要一个总数,而不是按组列出。

一样简单

1
-- Number of distinct non-NULL values in col_01.
SELECT COUNT(DISTINCT col_01)
FROM Table_01


如果您希望查看表中是否有重复的行,我使用下面的查询:

1
2
3
4
5
6
7
8
9
10
11
-- Demo: compare the total row count with the number of distinct rows; a
-- difference means the table contains fully identical rows.
CREATE TABLE my_table(id INT, name VARCHAR(100), email VARCHAR(100));

-- NOTE(review): the email literals look like a redaction placeholder, not
-- real sample data; they are kept unchanged. The first two rows are
-- identical, which is what the demo relies on.
INSERT INTO my_table (id, name, email) VALUES (1, 'shekh', '[email protected]');
INSERT INTO my_table (id, name, email) VALUES (1, 'shekh', '[email protected]');
INSERT INTO my_table (id, name, email) VALUES (2, 'Aman', '[email protected]');
INSERT INTO my_table (id, name, email) VALUES (3, 'Tom', '[email protected]');
INSERT INTO my_table (id, name, email) VALUES (4, 'Raj', '[email protected]');


-- Each statement is terminated so the two queries do not run together on
-- engines that require a separator.
SELECT COUNT(*) AS Total_Rows FROM my_table;
SELECT COUNT(*) AS Distinct_Rows FROM (SELECT DISTINCT * FROM my_table) abc;

1
2
3
4
5
6
7
8
9
10
11
 SELECT emp.ename, emp.empno, dept.loc
          FROM emp
 INNER JOIN dept
          ON dept.deptno=emp.deptno
 INNER JOIN
    (SELECT ename, COUNT(*) FROM
    emp
    GROUP BY ename, deptno
    HAVING COUNT(*) > 1)
 t ON emp.ename=t.ename ORDER BY emp.ename
/


-- ids that occur more than once in table1, with their counts.
SELECT id, COUNT(id)
FROM table1
GROUP BY id
HAVING COUNT(id) > 1;

我认为这对于搜索特定列中的重复值是正确的。


这是我想到的最简单的事情。它使用一个公共表表达式(CTE)和一个分区窗口(我认为这些功能在SQL 2008和更高版本中)。

此示例查找具有重复名称和dob的所有学生。要检查重复性的字段放在over子句中。可以在投影中包含任何其他字段。

1
2
3
4
5
6
7
-- Attach to every student row the size of its
-- (FirstName, LastName, DateOfBirth) group, then keep the rows whose group
-- has more than one member.
WITH cte (StudentId, Fname, LName, DOB, RowCnt) AS (
    SELECT
        StudentId,
        FirstName,
        LastName,
        DateOfBirth AS DOB,
        COUNT(*) OVER (PARTITION BY FirstName, LastName, DateOfBirth) AS RowCnt
    FROM tblStudent
)
SELECT *
FROM CTE
WHERE RowCnt > 1
ORDER BY DOB, LName

1
2
3
4
5
6
-- Label each row 'Yes' when it is the second or later row of its
-- (name, email) group, otherwise 'No'.
SELECT
    name,
    email,
    CASE
        WHEN ROW_NUMBER() OVER (PARTITION BY name, email ORDER BY name) > 1 THEN 'Yes'
        ELSE 'No'
    END AS "duplicated ?"
FROM users


通过使用CTE,我们也可以找到类似这样的重复值

1
2
3
4
5
6
7
-- Number the Employees rows of each EmailId in id order; rows numbered
-- above 1 repeat an EmailId that already appeared.
WITH MyCTE AS (
    SELECT
        Name,
        EmailId,
        ROW_NUMBER() OVER (PARTITION BY EmailId ORDER BY id) AS Duplicate
    FROM [Employees]
)
SELECT *
FROM MyCTE
WHERE Duplicate > 1

这也应该有效,也许试试看。

1
2
3
4
5
  SELECT * FROM Users a
            WHERE EXISTS (SELECT * FROM Users b
                WHERE (     a.name = b.name
                        OR  a.email = b.email)
                     AND a.ID != b.id)

如果您搜索具有某种前缀或常规更改(如邮件中的新域)的重复项,那么在这种情况下尤其有用。然后可以在这些列中使用replace()。


1
2
-- For each email, return the row holding the highest rowid among the rows
-- with that email.
SELECT *
FROM users u
WHERE rowid = (
    SELECT MAX(rowid)
    FROM users u1
    WHERE u.email = u1.email
);

如果要查找重复数据(按一个或多个标准),请选择实际行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
-- Step 1: collect the key pairs that occur more than once.
-- Step 2: join back to MyTable to return the full rows of those pairs,
-- sorted so the members of each pair sit next to each other.
WITH MYCTE AS (
    SELECT
        DuplicateKey1,
        DuplicateKey2, -- optional
        COUNT(*) X
    FROM MyTable
    GROUP BY DuplicateKey1, DuplicateKey2
    HAVING COUNT(*) > 1
)
SELECT E.*
FROM MyTable E
INNER JOIN MYCTE dup_keys
    ON E.DuplicateKey1 = dup_keys.DuplicateKey1
   AND E.DuplicateKey2 = dup_keys.DuplicateKey2
ORDER BY E.DuplicateKey1, E.DuplicateKey2, CreatedAt

http://developer.azurewebsites.net/2014/09/better-sql-group-by-find-duplicate-data/


-- Values of column_name that occur more than once, with their counts.
-- table_name and column_name are placeholders. The grouped column matches
-- the selected one, and the stray comma before HAVING (a syntax error) is
-- removed.
SELECT column_name, COUNT(*)
FROM table_name
GROUP BY column_name
HAVING COUNT(*) > 1;


删除名称重复的记录

1
2
3
4
5
6
7
-- Number the rows within each name group and delete, through the CTE,
-- every row numbered above 1, leaving one row per name.
;WITH CTE AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY name ORDER BY name) AS T
    FROM @YourTable
)
DELETE FROM CTE
WHERE T > 1

可以使用select distinct关键字除去重复项。您还可以按名称筛选,并在表中获取具有该名称的所有人。


我们可以在这里使用having来处理聚合函数,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
-- Demo (T-SQL): find (id_account, DATA) pairs that occur more than once.
CREATE TABLE #TableB (id_account INT, DATA INT, [DATE] DATE)

-- Explicit column list, and ISO 8601 date literals: the mm/dd/yyyy strings
-- depended on the session's language / DATEFORMAT settings.
INSERT INTO #TableB (id_account, DATA, [DATE])
VALUES
    (1, -50,  '2018-10-20'),
    (1, 20,   '2018-10-09'),
    (2, -900, '2018-10-01'),
    (1, 20,   '2018-09-25'),
    (1, -100, '2018-08-01')

-- HAVING uses COUNT(*) so the filter agrees with the count that is shown.
SELECT id_account, DATA, COUNT(*)
FROM #TableB
GROUP BY id_account, DATA
HAVING COUNT(*) > 1

DROP TABLE #TableB

这里有两个字段id_account和data与count(*)一起使用。因此,它将给出所有在两列中具有一倍以上相同值的记录。

如果由于某些原因,我们忘了在 SQL Server 表上添加任何约束,而前端应用程序已经插入了所有列都相同的重复记录,那么可以使用下面的语句从表中删除这些重复记录。

1
2
3
4
-- Rebuild #OriginalTable so it holds one copy of every fully identical row.
-- The steps run as one transaction: with XACT_ABORT ON a run-time error
-- rolls everything back, so a failure after TRUNCATE cannot leave the
-- original table empty.
SET XACT_ABORT ON
BEGIN TRANSACTION
    SELECT DISTINCT * INTO #TemNewTable FROM #OriginalTable
    TRUNCATE TABLE #OriginalTable
    INSERT INTO #OriginalTable SELECT * FROM #TemNewTable
    DROP TABLE #TemNewTable
COMMIT TRANSACTION

在这里,我们取了原始表的所有不同记录,并删除了原始表的记录。我们再次将所有不同的值从新表插入到原始表中,然后删除新表。


检查表中的重复记录。

1
2
3
-- Rows for which a row with the same name and email and a higher rowid
-- exists.
SELECT *
FROM users s
WHERE rowid < ANY (
    SELECT rowid
    FROM users k
    WHERE s.name = k.name
      AND s.email = k.email
);

1
2
3
-- Rows whose rowid is not the highest rowid among the rows with the same
-- name and email.
SELECT *
FROM users s
WHERE rowid NOT IN (
    SELECT MAX(rowid)
    FROM users k
    WHERE s.name = k.name
      AND s.email = k.email
);

删除表中的重复记录。

1
2
3
-- Delete rows for which a row with the same name and email and a higher
-- rowid exists.
DELETE FROM users s
WHERE rowid < ANY (
    SELECT rowid
    FROM users k
    WHERE s.name = k.name
      AND s.email = k.email
);

1
2
3
-- Delete rows whose rowid is not the highest rowid among the rows with the
-- same name and email.
DELETE FROM users s
WHERE rowid NOT IN (
    SELECT MAX(rowid)
    FROM users k
    WHERE s.name = k.name
      AND s.email = k.email
);

如何获取表中的重复记录

1
2
 SELECT COUNT(EmpCode),EmpCode FROM tbl_Employees WHERE STATUS=1
 GROUP BY EmpCode HAVING COUNT(EmpCode) > 1