|
Filling Empty Cells in the Sparse Data Feed of Famous Actors and Actresses
By Kalman Toth, M.Phil., M.Phil.,
MCDBA, MCITP
It frequently happens that you get a data feed from an external or internal source, which is not in perfect shape for a relational database load. Therefore you have to perform the “transform” step of the Extract, Transform & Load (ETL) data warehouse population process.
Using the example of the religion(s) of famous film Actors and Actresses, the feed only contains the Name and Rank once, even if there are more than one religion lines. The reason is that the feed originates in a report where it would look funny to list the Rank and Name repeatedly. It is a common editorial practice when tabulating data in an article. The data feed source is: http://www.adherents.com/movies/adh_actors.html .
First a table is created; following the data is loaded in the original format.
The transformation step is an UPDATE applying a SELF-JOIN. The first table contains the Name and Rank, the second table holds nulls in those cells. In the UPDATE statement the Rank and Name nulls are updated from the first (left) table.
Here is the listing:
USE [AdventureWorks]
GO
-- drop table [dbo].[FamousActorsandActresses]
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[FamousActorsandActresses](
[FamousActorsandActressesID] [int] NOT NULL,
[Rank] [int] NULL,
[Name] [nvarchar](70) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[Religion] [nvarchar](50) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[StageRole] [char](7) COLLATE SQL_Latin1_General_CP1_CI_AS NULL
) ON [PRIMARY]
GO
SET ANSI_PADDING ON
GO
insert FamousActorsandActresses values (1, 1, 'Humphrey Bogart', 'Episcopalian (lapsed)', 'actor ')
insert FamousActorsandActresses values (2, 2, 'Cary Grant', 'Anglican (lapsed); part-Jewish;', 'actor ')
insert FamousActorsandActresses values (2, null, null, 'generic spirituality as adult', 'actor ')
insert FamousActorsandActresses values (3, 3, 'Jimmy Stewart', 'Presbyterian', 'actor ')
insert FamousActorsandActresses values (4, 4, 'Marlon Brando', 'minimal Christian Science;', 'actor ')
insert FamousActorsandActresses values (4, null, null, 'Native American spirituality', 'actor ')
insert FamousActorsandActresses values (5, 5, 'Fred Astaire', 'Episcopalian', 'actor ')
insert FamousActorsandActresses values (6, 6, 'Henry Fonda', 'Christian Science (lapsed)', 'actor ')
insert FamousActorsandActresses values (7, 7, 'Clark Gable', 'Catholic', 'actor ')
insert FamousActorsandActresses values (8, 8, 'James Cagney', 'Catholic', 'actor ')
insert FamousActorsandActresses values (9, 9, 'Spencer Tracy', 'Catholic', 'actor ')
insert FamousActorsandActresses values (10, 10, 'Charlie Chaplin', 'Anglican; agnostic', 'actor ')
insert FamousActorsandActresses values (11, 11, 'Gary Cooper', 'Catholic (convert)', 'actor ')
insert FamousActorsandActresses values (12, 12, 'Gregory Peck', 'Catholic', 'actor ')
insert FamousActorsandActresses values (13, 13, 'John Wayne', 'Presbyterian; Catholic', 'actor ')
insert FamousActorsandActresses values (14, 14, 'Laurence Olivier', 'Anglican', 'actor ')
insert FamousActorsandActresses values (15, 15, 'Gene Kelly', 'Catholic', 'actor ')
insert FamousActorsandActresses values (16, 16, 'Orson Welles', 'Protestant Christian', 'actor ')
insert FamousActorsandActresses values (17, 17, 'Kirk Douglas', 'Judaism', 'actor ')
insert FamousActorsandActresses values (18, 18, 'James Dean', 'Quaker', 'actor ')
insert FamousActorsandActresses values (19, 19, 'Burt Lancaster', 'Church of the Son of Man', 'actor ')
insert FamousActorsandActresses values (19, null, null, '("non-denominational" Protestant); ACLU', 'actor ')
insert FamousActorsandActresses values (20, 20, 'The Marx Brothers', 'Jewish', 'actor ')
insert FamousActorsandActresses values (21, 21, 'Buster Keaton', 'Catholic (non-practicing)', 'actor ')
insert FamousActorsandActresses values (22, 22, 'Sidney Poitier', 'Vodoun; Anglican; Catholic', 'actor ')
insert FamousActorsandActresses values (23, 23, 'Robert Mitchum', 'Methodist (nominal)', 'actor ')
insert FamousActorsandActresses values (24, 24, 'Edward G. Robinson', 'Judaism', 'actor ')
insert FamousActorsandActresses values (25, 25, 'William Holden', 'Congregationalist', 'actor ')
insert FamousActorsandActresses values (26, 1, 'Katharine Hepburn', 'nominal Episcopalian background; atheist', 'actress')
insert FamousActorsandActresses values (27, 2, 'Bette Davis', 'Baptist/Episcopalian family background;', 'actress')
insert FamousActorsandActresses values (27, null, null, 'Christian Science (temporary)', 'actress')
insert FamousActorsandActresses values (28, 3, 'Audrey Hepburn', 'Christian Science', 'actress')
insert FamousActorsandActresses values (29, 4, 'Ingrid Bergman', 'Lutheran', 'actress')
insert FamousActorsandActresses values (30, 5, 'Greta Garbo', 'Lutheran', 'actress')
insert FamousActorsandActresses values (31, 6, 'Marilyn Monroe', 'Christian Science;', 'actress')
insert FamousActorsandActresses values (31, null, null, 'temporary convert to Judaism', 'actress')
insert FamousActorsandActresses values (32, 7, 'Elizabeth Taylor', 'Christian Science;', 'actress')
insert FamousActorsandActresses values (32, null, null, 'convert to Reform Judaism', 'actress')
insert FamousActorsandActresses values (33, 8, 'Judy Garland', 'Episcopalian', 'actress')
insert FamousActorsandActresses values (34, 9, 'Marlene Dietrich', 'mostly nonreligious but superstitious;', 'actress')
insert FamousActorsandActresses values (34, null, null, 'nominal Lutheran background', 'actress')
insert FamousActorsandActresses values (35, 10, 'Joan Crawford', 'raised Catholic;', 'actress')
insert FamousActorsandActresses values (35, null, null, 'Christian Science (convert)', 'actress')
insert FamousActorsandActresses values (36, 11, 'Barbara Stanwyck', 'Protestant (nominal)', 'actress')
insert FamousActorsandActresses values (37, 12, 'Claudette Colbert', 'Catholic', 'actress')
insert FamousActorsandActresses values (38, 13, 'Grace Kelly', 'Catholic', 'actress')
insert FamousActorsandActresses values (39, 14, 'Ginger Rogers', 'Christian Science', 'actress')
insert FamousActorsandActresses values (40, 15, 'Mae West', 'Catholic; Spiritualism', 'actress')
insert FamousActorsandActresses values (41, 16, 'Vivien Leigh', 'Catholic', 'actress')
insert FamousActorsandActresses values (42, 17, 'Lillian Gish', 'Episcopalian', 'actress')
insert FamousActorsandActresses values (43, 18, 'Shirley Temple', 'Presbyterian, Episcopalian', 'actress')
insert FamousActorsandActresses values (44, 19, 'Rita Hayworth', 'Catholic', 'actress')
insert FamousActorsandActresses values (45, 20, 'Lauren Bacall', 'Jewish (non-observant)', 'actress')
insert FamousActorsandActresses values (46, 21, 'Sophia Loren', 'Catholic', 'actress')
insert FamousActorsandActresses values (47, 22, 'Jean Harlow', 'nominal Christian background,', 'actress')
insert FamousActorsandActresses values (47, null, null, 'incl. Christian Science', 'actress')
insert FamousActorsandActresses values (48, 23, 'Carole Lombard', 'Baha''i', 'actress')
insert FamousActorsandActresses values (49, 24, 'Mary Pickford', 'Catholic; Christian Science (convert)', 'actress')
insert FamousActorsandActresses values (50, 24, 'Ava Gardner', 'Southern Baptist', 'actress')
GO
SELECT * FROM FamousActorsandActresses
GO
UPDATE b SET b.Name = a.Name, b.Rank = a.Rank
FROM FamousActorsandActresses a
JOIN FamousActorsandActresses b
ON b.FamousActorsandActressesID = a.FamousActorsandActressesID
WHERE a.Name is not null
and b.Name is null
GO
SELECT * FROM FamousActorsandActresses
GO
|